Cleanup before merging to the master merge candidate branch

2020-06-24 15:13:15 +03:00
parent 0e4b39d6d7
commit f04e347c45
4 changed files with 52 additions and 269 deletions
--- a/samples/bwtest/CMakeLists.txt
+++ b/samples/bwtest/CMakeLists.txt
@@ -5,5 +5,5 @@ find_package(OpenMP)
 find_package(CUDAToolkit)
 add_executable(bwtest main.c)
-target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
+target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
 target_compile_options(bwtest PRIVATE -O3)
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -7,6 +7,7 @@
 #include <mpi.h>
 #include <cuda_runtime_api.h>
 #include <cuda.h> // CUDA driver API
 #include "timer_hires.h" // From src/common
@@ -56,6 +57,17 @@ allocDevice(const size_t bytes)
 static uint8_t*
 allocDevicePinned(const size_t bytes)
 {
    #define USE_CUDA_DRIVER_PINNING (1)
    #if USE_CUDA_DRIVER_PINNING
    uint8_t* arr = allocDevice(bytes);
    unsigned int flag = 1;
    CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr);
    errchk(retval == CUDA_SUCCESS);
    return arr;
    #else
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
@@ -65,8 +77,24 @@ allocDevicePinned(const size_t bytes)
    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
    #endif
 }
 /*
 static uint8_t*
 allocDevicePinned(const size_t bytes)
 {
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
 }*/
 static void
 freeDevice(uint8_t* arr)
 {
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,7 +2,7 @@ find_package(CUDAToolkit)
 ## Astaroth Core
 add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
-target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart)
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver)
 ## Options
 if (MPI_ENABLED)
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -10,13 +10,16 @@
 #include "kernels/kernels.h"
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 #define MPI_GPUDIRECT_DISABLED (0)
-#define DECOMPOSITION_AXES (3)
+#define MPI_GPUDIRECT_DISABLED (0) // Buffer through host memory, deprecated
 #define MPI_DECOMPOSITION_AXES (3)
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
-#define MPI_USE_PINNED (1)
+#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory
 #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
 #include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
 AcResult
 acDevicePrintInfo(const Device device)
@@ -530,7 +533,7 @@ morton3D(const uint64_t pid)
    uint64_t i, j, k;
    i = j = k = 0;
-    if (DECOMPOSITION_AXES == 3) {
+    if (MPI_DECOMPOSITION_AXES == 3) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << 3 * bit;
            k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
@@ -538,32 +541,22 @@ morton3D(const uint64_t pid)
            i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
        }
    }
    /*
    else if (DECOMPOSITION_AXES == 3) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << 3 * bit;
            i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
            k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
        }
    }
    */
    // Just a quick copy/paste for other decomp dims
-    else if (DECOMPOSITION_AXES == 2) {
+    else if (MPI_DECOMPOSITION_AXES == 2) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << 2 * bit;
            j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
            k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
        }
    }
-    else if (DECOMPOSITION_AXES == 1) {
+    else if (MPI_DECOMPOSITION_AXES == 1) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << 1 * bit;
            k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
        }
    }
    else {
-        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
        ERRCHK_ALWAYS(0);
    }
@@ -575,7 +568,7 @@ morton1D(const uint3_64 pid)
 {
    uint64_t i = 0;
-    if (DECOMPOSITION_AXES == 3) {
+    if (MPI_DECOMPOSITION_AXES == 3) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << bit;
            i |= ((pid.z & mask) << 0) << 2 * bit;
@@ -583,64 +576,26 @@ morton1D(const uint3_64 pid)
            i |= ((pid.x & mask) << 2) << 2 * bit;
        }
    }
-    /*
+    else if (MPI_DECOMPOSITION_AXES == 2) {
    else if (DECOMPOSITION_AXES == 3) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << bit;
            i |= ((pid.x & mask) << 0) << 2 * bit;
            i |= ((pid.y & mask) << 1) << 2 * bit;
            i |= ((pid.z & mask) << 2) << 2 * bit;
        }
    }*/
    else if (DECOMPOSITION_AXES == 2) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << bit;
            i |= ((pid.y & mask) << 0) << 1 * bit;
            i |= ((pid.z & mask) << 1) << 1 * bit;
        }
    }
-    else if (DECOMPOSITION_AXES == 1) {
+    else if (MPI_DECOMPOSITION_AXES == 1) {
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = 0x1l << bit;
            i |= ((pid.z & mask) << 0) << 0 * bit;
        }
    }
    else {
-        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
        ERRCHK_ALWAYS(0);
    }
    return i;
 }
 /*
 static uint3_64
 morton3D(const uint64_t pid)
 {
    uint64_t i, j, k;
    i = j = k = 0;
    for (int bit = 0; bit <= 21; ++bit) {
        const uint64_t mask = 0x1l << 3 * bit;
        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
    }
    return (uint3_64){i, j, k};
 }
 static uint64_t
 morton1D(const uint3_64 pid)
 {
    uint64_t i = 0;
    for (int bit = 0; bit <= 21; ++bit) {
        const uint64_t mask = 0x1l << bit;
        i |= ((pid.x & mask) << 0) << 2 * bit;
        i |= ((pid.y & mask) << 1) << 2 * bit;
        i |= ((pid.z & mask) << 2) << 2 * bit;
    }
    return i;
 }
 */
 static uint3_64
 decompose(const uint64_t target)
@@ -701,9 +656,17 @@ acCreatePackedData(const int3 dims)
    const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
    #if MPI_USE_CUDA_DRIVER_PINNING
      ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
      unsigned int flag = 1;
      CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned);
      ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
    #else
    ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
    // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
    // slower than pinned (38 ms vs. 125 ms)
    #fi // USE_CUDA_DRIVER_PINNING
    return data;
 }
@@ -1588,214 +1551,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    return AC_SUCCESS;
 }
 AcResult
 acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
 {
    ERRCHK(grid.initialized);
    // acGridSynchronizeStream(stream);
    const Device device = grid.device;
    const int3 nn       = grid.nn;
    // CommData corner_data = grid.corner_data; // Do not rm: required for corners
    CommData edgex_data  = grid.edgex_data;
    CommData edgey_data  = grid.edgey_data;
    CommData edgez_data  = grid.edgez_data;
    CommData sidexy_data = grid.sidexy_data;
    CommData sidexz_data = grid.sidexz_data;
    CommData sideyz_data = grid.sideyz_data;
    acDeviceSynchronizeStream(device, stream);
    // Corners
    /*
    // Do not rm: required for corners
    const int3 corner_b0s[] = {
        (int3){0, 0, 0},
        (int3){NGHOST + nn.x, 0, 0},
        (int3){0, NGHOST + nn.y, 0},
        (int3){0, 0, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
    };
    */
    // Edges X
    const int3 edgex_b0s[] = {
        (int3){NGHOST, 0, 0},
        (int3){NGHOST, NGHOST + nn.y, 0},
        (int3){NGHOST, 0, NGHOST + nn.z},
        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
    };
    // Edges Y
    const int3 edgey_b0s[] = {
        (int3){0, NGHOST, 0},
        (int3){NGHOST + nn.x, NGHOST, 0},
        (int3){0, NGHOST, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
    };
    // Edges Z
    const int3 edgez_b0s[] = {
        (int3){0, 0, NGHOST},
        (int3){NGHOST + nn.x, 0, NGHOST},
        (int3){0, NGHOST + nn.y, NGHOST},
        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
    };
    // Sides XY
    const int3 sidexy_b0s[] = {
        (int3){NGHOST, NGHOST, 0},             //
        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
    };
    // Sides XZ
    const int3 sidexz_b0s[] = {
        (int3){NGHOST, 0, NGHOST},             //
        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
    };
    // Sides YZ
    const int3 sideyz_b0s[] = {
        (int3){0, NGHOST, NGHOST},             //
        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
    };
    for (int isubstep = 0; isubstep < 3; ++isubstep) {
 #if MPI_COMM_ENABLED
        // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
        acPackCommData(device, edgex_b0s, &edgex_data);
        acPackCommData(device, edgey_b0s, &edgey_data);
        acPackCommData(device, edgez_b0s, &edgez_data);
        acPackCommData(device, sidexy_b0s, &sidexy_data);
        acPackCommData(device, sidexz_b0s, &sidexz_data);
        acPackCommData(device, sideyz_b0s, &sideyz_data);
 #endif
 #if MPI_COMM_ENABLED
        MPI_Barrier(MPI_COMM_WORLD);
 #if MPI_GPUDIRECT_DISABLED
        // acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
        acTransferCommDataToHost(device, &edgex_data);
        acTransferCommDataToHost(device, &edgey_data);
        acTransferCommDataToHost(device, &edgez_data);
        acTransferCommDataToHost(device, &sidexy_data);
        acTransferCommDataToHost(device, &sidexz_data);
        acTransferCommDataToHost(device, &sideyz_data);
 #endif
        // acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
        acTransferCommData(device, edgex_b0s, &edgex_data);
        acTransferCommData(device, edgey_b0s, &edgey_data);
        acTransferCommData(device, edgez_b0s, &edgez_data);
        acTransferCommData(device, sidexy_b0s, &sidexy_data);
        acTransferCommData(device, sidexz_b0s, &sidexz_data);
        acTransferCommData(device, sideyz_b0s, &sideyz_data);
 #endif // MPI_COMM_ENABLED
 #if MPI_COMPUTE_ENABLED
        //////////// INNER INTEGRATION //////////////
        {
            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
            const int3 m2 = nn;
            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
        }
 ////////////////////////////////////////////
 #endif // MPI_COMPUTE_ENABLED
 #if MPI_COMM_ENABLED
        // acTransferCommDataWait(corner_data); // Do not rm: required for corners
        acTransferCommDataWait(edgex_data);
        acTransferCommDataWait(edgey_data);
        acTransferCommDataWait(edgez_data);
        acTransferCommDataWait(sidexy_data);
        acTransferCommDataWait(sidexz_data);
        acTransferCommDataWait(sideyz_data);
 #if MPI_GPUDIRECT_DISABLED
        // acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners
        acTransferCommDataToDevice(device, &edgex_data);
        acTransferCommDataToDevice(device, &edgey_data);
        acTransferCommDataToDevice(device, &edgez_data);
        acTransferCommDataToDevice(device, &sidexy_data);
        acTransferCommDataToDevice(device, &sidexz_data);
        acTransferCommDataToDevice(device, &sideyz_data);
 #endif
        // acUnpinCommData(device, &corner_data); // Do not rm: required for corners
        acUnpinCommData(device, &edgex_data);
        acUnpinCommData(device, &edgey_data);
        acUnpinCommData(device, &edgez_data);
        acUnpinCommData(device, &sidexy_data);
        acUnpinCommData(device, &sidexz_data);
        acUnpinCommData(device, &sideyz_data);
        // acUnpackCommData(device, corner_b0s, &corner_data);
        acUnpackCommData(device, edgex_b0s, &edgex_data);
        acUnpackCommData(device, edgey_b0s, &edgey_data);
        acUnpackCommData(device, edgez_b0s, &edgez_data);
        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
        //////////// OUTER INTEGRATION //////////////
        // Wait for unpacking
        // acSyncCommData(corner_data); // Do not rm: required for corners
        acSyncCommData(edgex_data);
        acSyncCommData(edgey_data);
        acSyncCommData(edgez_data);
        acSyncCommData(sidexy_data);
        acSyncCommData(sidexz_data);
        acSyncCommData(sideyz_data);
 #endif // MPI_COMM_ENABLED
 #if MPI_COMPUTE_ENABLED
        { // Front
            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
        }
        { // Back
            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
        }
        { // Bottom
            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
        }
        { // Top
            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
        }
        { // Left
            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
        }
        { // Right
            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
        }
 #endif // MPI_COMPUTE_ENABLED
        acDeviceSwapBuffers(device);
        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
        ////////////////////////////////////////////
    }
    return AC_SUCCESS;
 }
 AcResult
 acGridPeriodicBoundconds(const Stream stream)
 {