diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c index 02aa6e0..aab16eb 100644 --- a/acc/src/code_generator.c +++ b/acc/src/code_generator.c @@ -43,7 +43,7 @@ static FILE* FHEADER = NULL; static const char* dslheader_filename = "user_defines.h"; static const char* cudaheader_filename = "user_kernels.h"; -static const char* fheader_filename = "astaroth_fortran.h"; +static const char* fheader_filename = "astaroth.f90"; // Forward declaration of yyparse int yyparse(void); diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index 962a316..16a99df 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -207,24 +207,18 @@ main(int argc, char** argv) results[nth_percentile * num_iters], 100 * nth_percentile); char path[4096] = ""; - if (test == TEST_STRONG_SCALING) - strncpy(path, "strong_scaling.csv", sizeof(path)); - else if (test == TEST_WEAK_SCALING) - strncpy(path, "weak_scaling.csv", sizeof(path)); - else - ERROR("Invalid test type"); + sprintf(path, "%s_%d.csv", test == TEST_STRONG_SCALING ? "strong" : "weak", nprocs); FILE* fp = fopen(path, "a"); ERRCHK_ALWAYS(fp); // Format - // nprocs, measured (ms) - fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]); - + // nprocs, min, 50th perc, 90th perc, max + fprintf(fp, "%d, %g, %g, %g, %g\n", nprocs, results[0], results[0.5 * num_iters], results[nth_percentile * num_iters], results[num_iters-1]); fclose(fp); } /* -const size_t num_iters = 100; +const size_t num_iters = 1000; const double nth_percentile = 0.90; std::vector results; // ms diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c index a45bf1a..ce782ed 100644 --- a/samples/genbenchmarkscripts/main.c +++ b/samples/genbenchmarkscripts/main.c @@ -21,6 +21,7 @@ main(void) fprintf(fp, "#SBATCH --time=00:14:59\n"); fprintf(fp, "#SBATCH --mem=32000\n"); fprintf(fp, "#SBATCH --partition=gpu\n"); + fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // nprocs, nodes, gpus const int max_gpus_per_node = 4; @@ -29,22 +30,30 @@ main(void) fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); fprintf(fp, "#SBATCH -n %d\n", nprocs); fprintf(fp, "#SBATCH -N %d\n", nodes); - fprintf(fp, "#SBATCH --exclusive\n"); + //fprintf(fp, "#SBATCH --exclusive\n"); + if (nprocs > 4) + fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); // Modules - fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); + // OpenMPI + fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n"); + // HPCX + //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Profile and run - fprintf(fp, "mkdir -p profile_%d\n", nprocs); + //fprintf(fp, "mkdir -p profile_%d\n", nprocs); const int nx = 256; // max size 1792; const int ny = nx; const int nz = nx; + /* fprintf(fp, //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " //"%d\n", "srun ./benchmark %d %d %d\n", nx, ny, nz); + */ + fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz); fclose(fp); } diff --git a/src/core/device.cc b/src/core/device.cc index 689eaf7..e465017 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -16,7 +16,7 @@ #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) #define MPI_INCL_CORNERS (0) -#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory +#define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost #include // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set) @@ -115,7 +115,7 @@ AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle) { cudaSetDevice(id); - // cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate + cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate // multiple devices with a single GPU // Create Device @@ -1169,10 +1169,8 @@ acTransferCommData(const Device device, // static void acTransferCommDataWait(const CommData data) { - for (size_t i = 0; i < data.count; ++i) { - MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE); - MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE); - } + MPI_Waitall(data.count, data.recv_reqs, MPI_STATUSES_IGNORE); + MPI_Waitall(data.count, data.send_reqs, MPI_STATUSES_IGNORE); } typedef struct { @@ -1337,8 +1335,10 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh) return AC_SUCCESS; } +/* +// Unused AcResult -acGridIntegrate(const Stream stream, const AcReal dt) +acGridIntegratePipelined(const Stream stream, const AcReal dt) { ERRCHK(grid.initialized); acGridSynchronizeStream(stream); @@ -1549,6 +1549,220 @@ acGridIntegrate(const Stream stream, const AcReal dt) acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done return AC_SUCCESS; } +*/ + +AcResult +acGridIntegrate(const Stream stream, const AcReal dt) +{ + ERRCHK(grid.initialized); + acGridSynchronizeStream(stream); + + const Device device = grid.device; + const int3 nn = grid.nn; +#if MPI_INCL_CORNERS + CommData corner_data = grid.corner_data; // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + CommData edgex_data = grid.edgex_data; + CommData edgey_data = grid.edgey_data; + CommData edgez_data = grid.edgez_data; + CommData sidexy_data = grid.sidexy_data; + CommData sidexz_data = grid.sidexz_data; + CommData sideyz_data = grid.sideyz_data; + + acDeviceSynchronizeStream(device, stream); + +// Corners +#if MPI_INCL_CORNERS + // Do not rm: required for corners + const int3 corner_b0s[] = { + (int3){0, 0, 0}, + (int3){NGHOST + nn.x, 0, 0}, + (int3){0, NGHOST + nn.y, 0}, + (int3){0, 0, NGHOST + nn.z}, + + (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, + (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, + (int3){0, NGHOST + nn.y, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, + }; +#endif // MPI_INCL_CORNERS + + // Edges X + const int3 edgex_b0s[] = { + (int3){NGHOST, 0, 0}, + (int3){NGHOST, NGHOST + nn.y, 0}, + + (int3){NGHOST, 0, NGHOST + nn.z}, + (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, + }; + + // Edges Y + const int3 edgey_b0s[] = { + (int3){0, NGHOST, 0}, + (int3){NGHOST + nn.x, NGHOST, 0}, + + (int3){0, NGHOST, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, + }; + + // Edges Z + const int3 edgez_b0s[] = { + (int3){0, 0, NGHOST}, + (int3){NGHOST + nn.x, 0, NGHOST}, + + (int3){0, NGHOST + nn.y, NGHOST}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, + }; + + // Sides XY + const int3 sidexy_b0s[] = { + (int3){NGHOST, NGHOST, 0}, // + (int3){NGHOST, NGHOST, NGHOST + nn.z}, // + }; + + // Sides XZ + const int3 sidexz_b0s[] = { + (int3){NGHOST, 0, NGHOST}, // + (int3){NGHOST, NGHOST + nn.y, NGHOST}, // + }; + + // Sides YZ + const int3 sideyz_b0s[] = { + (int3){0, NGHOST, NGHOST}, // + (int3){NGHOST + nn.x, NGHOST, NGHOST}, // + }; + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + +#if MPI_COMM_ENABLED +#if MPI_INCL_CORNERS + acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acPackCommData(device, edgex_b0s, &edgex_data); + acPackCommData(device, edgey_b0s, &edgey_data); + acPackCommData(device, edgez_b0s, &edgez_data); + acPackCommData(device, sidexy_b0s, &sidexy_data); + acPackCommData(device, sidexz_b0s, &sidexz_data); + acPackCommData(device, sideyz_b0s, &sideyz_data); +#endif + +#if MPI_COMM_ENABLED + MPI_Barrier(MPI_COMM_WORLD); + +#if MPI_GPUDIRECT_DISABLED +#if MPI_INCL_CORNERS + acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommDataToHost(device, &edgex_data); + acTransferCommDataToHost(device, &edgey_data); + acTransferCommDataToHost(device, &edgez_data); + acTransferCommDataToHost(device, &sidexy_data); + acTransferCommDataToHost(device, &sidexz_data); + acTransferCommDataToHost(device, &sideyz_data); +#endif +#if MPI_INCL_CORNERS + acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommData(device, edgex_b0s, &edgex_data); + acTransferCommData(device, edgey_b0s, &edgey_data); + acTransferCommData(device, edgez_b0s, &edgez_data); + acTransferCommData(device, sidexy_b0s, &sidexy_data); + acTransferCommData(device, sidexz_b0s, &sidexz_data); + acTransferCommData(device, sideyz_b0s, &sideyz_data); +#endif // MPI_COMM_ENABLED + +#if MPI_COMPUTE_ENABLED + //////////// INNER INTEGRATION ////////////// + { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = nn; + acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); + } +//////////////////////////////////////////// +#endif // MPI_COMPUTE_ENABLED + +#if MPI_COMM_ENABLED +#if MPI_INCL_CORNERS + acTransferCommDataWait(corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommDataWait(edgex_data); + acTransferCommDataWait(edgey_data); + acTransferCommDataWait(edgez_data); + acTransferCommDataWait(sidexy_data); + acTransferCommDataWait(sidexz_data); + acTransferCommDataWait(sideyz_data); + +#if MPI_INCL_CORNERS + acUnpinCommData(device, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acUnpinCommData(device, &edgex_data); + acUnpinCommData(device, &edgey_data); + acUnpinCommData(device, &edgez_data); + acUnpinCommData(device, &sidexy_data); + acUnpinCommData(device, &sidexz_data); + acUnpinCommData(device, &sideyz_data); + +#if MPI_INCL_CORNERS + acUnpackCommData(device, corner_b0s, &corner_data); +#endif // MPI_INCL_CORNERS + acUnpackCommData(device, edgex_b0s, &edgex_data); + acUnpackCommData(device, edgey_b0s, &edgey_data); + acUnpackCommData(device, edgez_b0s, &edgez_data); + acUnpackCommData(device, sidexy_b0s, &sidexy_data); + acUnpackCommData(device, sidexz_b0s, &sidexz_data); + acUnpackCommData(device, sideyz_b0s, &sideyz_data); +//////////// OUTER INTEGRATION ////////////// + +// Wait for unpacking +#if MPI_INCL_CORNERS + acSyncCommData(corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acSyncCommData(edgex_data); + acSyncCommData(edgey_data); + acSyncCommData(edgez_data); + acSyncCommData(sidexy_data); + acSyncCommData(sidexz_data); + acSyncCommData(sideyz_data); +#endif // MPI_COMM_ENABLED +#if MPI_COMPUTE_ENABLED + { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt); + } + { // Back + const int3 m1 = (int3){NGHOST, NGHOST, nn.z}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt); + } + { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt); + } + { // Top + const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt); + } + { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt); + } + { // Right + const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt); + } +#endif // MPI_COMPUTE_ENABLED + acDeviceSwapBuffers(device); + acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done + //////////////////////////////////////////// + } + + return AC_SUCCESS; +} AcResult acGridPeriodicBoundconds(const Stream stream) @@ -1774,5 +1988,4 @@ acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBuff return acMPIReduceScal(local_result, rtype, result); } - #endif // AC_MPI_ENABLED