Pulled useful changes from the benchmark branch. GPUDirect RDMA (unpinned) is now the default for MPI communication.
@@ -43,7 +43,7 @@ static FILE* FHEADER = NULL;
 
 static const char* dslheader_filename = "user_defines.h";
 static const char* cudaheader_filename = "user_kernels.h";
-static const char* fheader_filename = "astaroth_fortran.h";
+static const char* fheader_filename = "astaroth.f90";
 
 // Forward declaration of yyparse
 int yyparse(void);
@@ -207,24 +207,18 @@ main(int argc, char** argv)
            results[nth_percentile * num_iters], 100 * nth_percentile);
 
     char path[4096] = "";
-    if (test == TEST_STRONG_SCALING)
-        strncpy(path, "strong_scaling.csv", sizeof(path));
-    else if (test == TEST_WEAK_SCALING)
-        strncpy(path, "weak_scaling.csv", sizeof(path));
-    else
-        ERROR("Invalid test type");
+    sprintf(path, "%s_%d.csv", test == TEST_STRONG_SCALING ? "strong" : "weak", nprocs);
 
     FILE* fp = fopen(path, "a");
     ERRCHK_ALWAYS(fp);
     // Format
-    // nprocs, measured (ms)
-    fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
+    // nprocs, min, 50th perc, 90th perc, max
+    fprintf(fp, "%d, %g, %g, %g, %g\n", nprocs, results[0], results[0.5 * num_iters], results[nth_percentile * num_iters], results[num_iters - 1]);
 
     fclose(fp);
 }
 
 /*
-const size_t num_iters = 100;
+const size_t num_iters = 1000;
 const double nth_percentile = 0.90;
 
 std::vector<double> results; // ms
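Note on the benchmark output above: the CSV row now records min, median, 90th percentile, and max per run, which only makes sense if `results` has already been sorted ascending before this point. A minimal standalone sketch of the same idea; the helper name `write_percentiles` and the index clamping are illustrative, not part of this commit:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Illustrative helper: append one CSV row with min / median / 90th percentile / max.
    // Assumes `results` holds one wall-clock measurement (ms) per iteration.
    static void
    write_percentiles(const char* path, int nprocs, std::vector<double> results)
    {
        std::sort(results.begin(), results.end());

        const size_t n   = results.size();
        const size_t p50 = std::min(n - 1, (size_t)(0.50 * n));
        const size_t p90 = std::min(n - 1, (size_t)(0.90 * n)); // clamp to stay in bounds

        FILE* fp = fopen(path, "a");
        if (!fp)
            return;
        fprintf(fp, "%d, %g, %g, %g, %g\n", nprocs, results.front(), results[p50],
                results[p90], results.back());
        fclose(fp);
    }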
@@ -21,6 +21,7 @@ main(void)
         fprintf(fp, "#SBATCH --time=00:14:59\n");
         fprintf(fp, "#SBATCH --mem=32000\n");
         fprintf(fp, "#SBATCH --partition=gpu\n");
+        fprintf(fp, "#SBATCH --cpus-per-task=10\n");
 
         // nprocs, nodes, gpus
         const int max_gpus_per_node = 4;
@@ -29,22 +30,30 @@ main(void)
         fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
         fprintf(fp, "#SBATCH -n %d\n", nprocs);
         fprintf(fp, "#SBATCH -N %d\n", nodes);
-        fprintf(fp, "#SBATCH --exclusive\n");
+        //fprintf(fp, "#SBATCH --exclusive\n");
+        if (nprocs > 4)
+            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
 
         // Modules
-        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
+        // OpenMPI
+        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n");
+        // HPCX
+        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
         fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");
 
         // Profile and run
-        fprintf(fp, "mkdir -p profile_%d\n", nprocs);
+        //fprintf(fp, "mkdir -p profile_%d\n", nprocs);
 
         const int nx = 256; // max size 1792;
         const int ny = nx;
         const int nz = nx;
+        /*
         fprintf(fp,
                 //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
                 //"%d\n",
                 "srun ./benchmark %d %d %d\n", nx, ny, nz);
+        */
+        fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);
 
         fclose(fp);
     }
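For context on the script generator above: with `max_gpus_per_node = 4` and one MPI process per GPU, the node count and per-node GPU count in the `--gres`/`-N` lines follow directly from `nprocs`. A sketch of that mapping, assuming the generator loops over power-of-two process counts (the loop bounds and variable roles other than `max_gpus_per_node` are assumptions):

    #include <stdio.h>

    int
    main(void)
    {
        const int max_gpus_per_node = 4;

        // One MPI process per GPU: fill a node before allocating the next one.
        for (int nprocs = 1; nprocs <= 64; nprocs *= 2) {
            const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
            const int nodes         = (nprocs + max_gpus_per_node - 1) / max_gpus_per_node;
            printf("nprocs %2d -> %d node(s), gpu:v100:%d per node\n", nprocs, nodes,
                   gpus_per_node);
        }
        return 0;
    }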
@@ -16,7 +16,7 @@
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
-#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory
+#define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory
 #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
 
 #include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
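The flag flip above is what the commit message refers to: with `MPI_USE_PINNED (0)`, halo buffers are handed to MPI without a pinning/staging step, relying on a CUDA-aware MPI + UCX stack to move device data over GPUDirect RDMA. A rough sketch of the two kinds of send path such a flag typically selects between; the helper names and the staging scheme are illustrative, not the library's actual code path:

    #include <cuda_runtime.h>
    #include <mpi.h>

    // GPUDirect / unpinned path (MPI_USE_PINNED == 0): the device pointer goes to MPI
    // directly and a CUDA-aware MPI stack moves the data without an explicit host copy.
    static void
    halo_send_gpudirect(const double* d_buf, int count, int dst, int tag, MPI_Request* req)
    {
        MPI_Isend(d_buf, count, MPI_DOUBLE, dst, tag, MPI_COMM_WORLD, req);
    }

    // Pinned/staged path (MPI_USE_PINNED == 1): copy into page-locked host memory
    // (e.g. allocated with cudaMallocHost) and send from the host buffer instead.
    static void
    halo_send_staged(const double* d_buf, double* h_pinned, int count, int dst, int tag,
                     MPI_Request* req)
    {
        cudaMemcpy(h_pinned, d_buf, count * sizeof(double), cudaMemcpyDeviceToHost);
        MPI_Isend(h_pinned, count, MPI_DOUBLE, dst, tag, MPI_COMM_WORLD, req);
    }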
@@ -115,7 +115,7 @@ AcResult
 acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
 {
     cudaSetDevice(id);
-    // cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate
+    cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate
     // multiple devices with a single GPU
 
     // Create Device
@@ -1169,10 +1169,8 @@ acTransferCommData(const Device device, //
 static void
 acTransferCommDataWait(const CommData data)
 {
-    for (size_t i = 0; i < data.count; ++i) {
-        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
-        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
-    }
+    MPI_Waitall(data.count, data.recv_reqs, MPI_STATUSES_IGNORE);
+    MPI_Waitall(data.count, data.send_reqs, MPI_STATUSES_IGNORE);
 }
 
 typedef struct {
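Replacing the per-request `MPI_Wait` loop with two `MPI_Waitall` calls lets the MPI library complete requests in whatever order they finish, rather than blocking on request i while request i+1 may already be done. A standalone sketch of the same pattern (the helper name is illustrative):

    #include <mpi.h>

    // Illustrative: complete a batch of nonblocking halo exchanges at once.
    // Waiting on the whole array avoids serializing on the posting order;
    // receives are drained first, matching the change above.
    static void
    wait_all_halo_requests(MPI_Request* recv_reqs, MPI_Request* send_reqs, int count)
    {
        MPI_Waitall(count, recv_reqs, MPI_STATUSES_IGNORE);
        MPI_Waitall(count, send_reqs, MPI_STATUSES_IGNORE);
    }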
@@ -1337,8 +1335,10 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
     return AC_SUCCESS;
 }
 
+/*
+// Unused
 AcResult
-acGridIntegrate(const Stream stream, const AcReal dt)
+acGridIntegratePipelined(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
     acGridSynchronizeStream(stream);
@@ -1549,6 +1549,220 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
     return AC_SUCCESS;
 }
+*/
 
+AcResult
+acGridIntegrate(const Stream stream, const AcReal dt)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device = grid.device;
+    const int3 nn = grid.nn;
+#if MPI_INCL_CORNERS
+    CommData corner_data = grid.corner_data; // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+    CommData edgex_data = grid.edgex_data;
+    CommData edgey_data = grid.edgey_data;
+    CommData edgez_data = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+    acDeviceSynchronizeStream(device, stream);
+
+    // Corners
+#if MPI_INCL_CORNERS
+    // Do not rm: required for corners
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+#endif // MPI_INCL_CORNERS
+
+    // Edges X
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0}, //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST}, //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST}, //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acPackCommData(device, edgex_b0s, &edgex_data);
+        acPackCommData(device, edgey_b0s, &edgey_data);
+        acPackCommData(device, edgez_b0s, &edgez_data);
+        acPackCommData(device, sidexy_b0s, &sidexy_data);
+        acPackCommData(device, sidexz_b0s, &sidexz_data);
+        acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif
+
+#if MPI_COMM_ENABLED
+        MPI_Barrier(MPI_COMM_WORLD);
+
+#if MPI_GPUDIRECT_DISABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acTransferCommDataToHost(device, &edgex_data);
+        acTransferCommDataToHost(device, &edgey_data);
+        acTransferCommDataToHost(device, &edgez_data);
+        acTransferCommDataToHost(device, &sidexy_data);
+        acTransferCommDataToHost(device, &sidexz_data);
+        acTransferCommDataToHost(device, &sideyz_data);
+#endif
+#if MPI_INCL_CORNERS
+        acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acTransferCommData(device, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_b0s, &edgez_data);
+        acTransferCommData(device, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+        ////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataWait(corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acTransferCommDataWait(edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acTransferCommDataWait(sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+
+#if MPI_INCL_CORNERS
+        acUnpinCommData(device, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acUnpinCommData(device, &edgex_data);
+        acUnpinCommData(device, &edgey_data);
+        acUnpinCommData(device, &edgez_data);
+        acUnpinCommData(device, &sidexy_data);
+        acUnpinCommData(device, &sidexz_data);
+        acUnpinCommData(device, &sideyz_data);
+
+#if MPI_INCL_CORNERS
+        acUnpackCommData(device, corner_b0s, &corner_data);
+#endif // MPI_INCL_CORNERS
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+        //////////// OUTER INTEGRATION //////////////
+
+        // Wait for unpacking
+#if MPI_INCL_CORNERS
+        acSyncCommData(corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+#endif // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
+        }
+#endif // MPI_COMPUTE_ENABLED
+        acDeviceSwapBuffers(device);
+        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
+        ////////////////////////////////////////////
+    }
+
+    return AC_SUCCESS;
+}
+
 AcResult
 acGridPeriodicBoundconds(const Stream stream)
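The new acGridIntegrate above follows one pattern per RK3 substep: pack the edge and side halo segments (the b0s arrays list each segment's starting corner), start the exchanges, integrate the interior that needs no remote data while the transfers are in flight, then wait, unpin, unpack, and integrate the six outer slabs (front/back/bottom/top/left/right) that border the halo. The following is a much-reduced, self-contained 1-D CPU analogue of that overlap pattern, not Astaroth code; the stencil, sizes, and tags are illustrative:

    #include <mpi.h>
    #include <vector>

    int
    main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);
        int rank, nprocs;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

        const int n = 64;                   // interior cells per rank
        std::vector<double> u(n + 2, rank); // u[0] and u[n+1] are ghost cells
        std::vector<double> unew(n + 2, 0.0);

        const int left  = (rank - 1 + nprocs) % nprocs;
        const int right = (rank + 1) % nprocs;

        // Post the halo exchange (analogous to acPackCommData + acTransferCommData).
        MPI_Request reqs[4];
        MPI_Irecv(&u[0], 1, MPI_DOUBLE, left, 0, MPI_COMM_WORLD, &reqs[0]);
        MPI_Irecv(&u[n + 1], 1, MPI_DOUBLE, right, 1, MPI_COMM_WORLD, &reqs[1]);
        MPI_Isend(&u[1], 1, MPI_DOUBLE, left, 1, MPI_COMM_WORLD, &reqs[2]);
        MPI_Isend(&u[n], 1, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, &reqs[3]);

        // "Inner integration": cells 2..n-1 need no ghost data, so compute while MPI runs.
        for (int i = 2; i < n; ++i)
            unew[i] = 0.5 * (u[i - 1] + u[i + 1]);

        // "Outer integration": the boundary cells need the ghosts, so wait first
        // (analogous to acTransferCommDataWait + acUnpackCommData).
        MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
        unew[1] = 0.5 * (u[0] + u[2]);
        unew[n] = 0.5 * (u[n - 1] + u[n + 1]);

        u.swap(unew); // analogous to acDeviceSwapBuffers
        MPI_Finalize();
        return 0;
    }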
@@ -1774,5 +1988,4 @@ acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBuff
-
     return acMPIReduceScal(local_result, rtype, result);
 }
 
 #endif // AC_MPI_ENABLED