diff --git a/src/core/device.cc b/src/core/device.cc index 56e2c30..2ab0a92 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -981,7 +981,7 @@ acTransferCommDataWait(const CommData data) } static AcResult -acDeviceCommunicateHalosMPI(const Device device) +acDeviceIntegrateMPI(const Device device, const AcReal dt) { // Configure const int3 nn = (int3){ @@ -1171,57 +1171,322 @@ acDeviceCommunicateHalosMPI(const Device device) timer_reset(&ttot); MPI_Barrier(MPI_COMM_WORLD); - acPackCommData(device, corner_a0s, &corner_data); - acPackCommData(device, edgex_a0s, &edgex_data); - acPackCommData(device, edgey_a0s, &edgey_data); - acPackCommData(device, edgez_a0s, &edgez_data); - acPackCommData(device, sidexy_a0s, &sidexy_data); - acPackCommData(device, sidexz_a0s, &sidexz_data); - acPackCommData(device, sideyz_a0s, &sideyz_data); + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acPackCommData(device, corner_a0s, &corner_data); + acPackCommData(device, edgex_a0s, &edgex_data); + acPackCommData(device, edgey_a0s, &edgey_data); + acPackCommData(device, edgez_a0s, &edgez_data); + acPackCommData(device, sidexy_a0s, &sidexy_data); + acPackCommData(device, sidexz_a0s, &sidexz_data); + acPackCommData(device, sideyz_a0s, &sideyz_data); #if MPI_GPUDIRECT_DISABLED - acTransferCommDataToHost(device, &corner_data); - acTransferCommDataToHost(device, &edgex_data); - acTransferCommDataToHost(device, &edgey_data); - acTransferCommDataToHost(device, &edgez_data); - acTransferCommDataToHost(device, &sidexy_data); - acTransferCommDataToHost(device, &sidexz_data); - acTransferCommDataToHost(device, &sideyz_data); + acTransferCommDataToHost(device, &corner_data); + acTransferCommDataToHost(device, &edgex_data); + acTransferCommDataToHost(device, &edgey_data); + acTransferCommDataToHost(device, &edgez_data); + acTransferCommDataToHost(device, &sidexy_data); + acTransferCommDataToHost(device, &sidexz_data); + acTransferCommDataToHost(device, &sideyz_data); #endif - acTransferCommData(device, corner_a0s, corner_b0s, &corner_data); - acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data); - acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data); - acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data); - acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data); - acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data); - acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data); + //////////// INNER INTEGRATION ////////////// + { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = nn; + acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); + } + //////////////////////////////////////////// - acTransferCommDataWait(corner_data); - acTransferCommDataWait(edgex_data); - acTransferCommDataWait(edgey_data); - acTransferCommDataWait(edgez_data); - acTransferCommDataWait(sidexy_data); - acTransferCommDataWait(sidexz_data); - acTransferCommDataWait(sideyz_data); + acTransferCommData(device, corner_a0s, corner_b0s, &corner_data); + acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data); + acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data); + acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data); + acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data); + acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data); + acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data); + + acTransferCommDataWait(corner_data); + acTransferCommDataWait(edgex_data); + acTransferCommDataWait(edgey_data); + acTransferCommDataWait(edgez_data); + acTransferCommDataWait(sidexy_data); + acTransferCommDataWait(sidexz_data); + acTransferCommDataWait(sideyz_data); #if MPI_GPUDIRECT_DISABLED - acTransferCommDataToDevice(device, &corner_data); - acTransferCommDataToDevice(device, &edgex_data); - acTransferCommDataToDevice(device, &edgey_data); - acTransferCommDataToDevice(device, &edgez_data); - acTransferCommDataToDevice(device, &sidexy_data); - acTransferCommDataToDevice(device, &sidexz_data); - acTransferCommDataToDevice(device, &sideyz_data); + acTransferCommDataToDevice(device, &corner_data); + acTransferCommDataToDevice(device, &edgex_data); + acTransferCommDataToDevice(device, &edgey_data); + acTransferCommDataToDevice(device, &edgez_data); + acTransferCommDataToDevice(device, &sidexy_data); + acTransferCommDataToDevice(device, &sidexz_data); + acTransferCommDataToDevice(device, &sideyz_data); #endif - acUnpackCommData(device, corner_b0s, &corner_data); - acUnpackCommData(device, edgex_b0s, &edgex_data); - acUnpackCommData(device, edgey_b0s, &edgey_data); - acUnpackCommData(device, edgez_b0s, &edgez_data); - acUnpackCommData(device, sidexy_b0s, &sidexy_data); - acUnpackCommData(device, sidexz_b0s, &sidexz_data); - acUnpackCommData(device, sideyz_b0s, &sideyz_data); + acUnpackCommData(device, corner_b0s, &corner_data); + acUnpackCommData(device, edgex_b0s, &edgex_data); + acUnpackCommData(device, edgey_b0s, &edgey_data); + acUnpackCommData(device, edgez_b0s, &edgez_data); + acUnpackCommData(device, sidexy_b0s, &sidexy_data); + acUnpackCommData(device, sidexz_b0s, &sidexz_data); + acUnpackCommData(device, sideyz_b0s, &sideyz_data); + //////////// OUTER INTEGRATION ////////////// + acDeviceSynchronizeStream(device, STREAM_ALL); // Wait for unpacking + { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt); + } + { // Back + const int3 m1 = (int3){NGHOST, NGHOST, nn.z}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt); + } + { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt); + } + { // Top + const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt); + } + { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt); + } + { // Right + const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt); + } + acDeviceSwapBuffers(device); + acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done + //////////////////////////////////////////// + } + + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + const double msec = timer_diff_nsec(ttot) / 1e6; + MPI_Barrier(MPI_COMM_WORLD); + + int pid, nprocs; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if (!pid) { + printf("--- Total communication time per step: %f ms\n", msec); + + // Write out to file + FILE* fp = fopen("benchmark.result", "a+"); + fprintf(fp, "%d, %f\n", nprocs, msec); + fclose(fp); + } + + // Dealloc + acDestroyCommData(device, &corner_data); + acDestroyCommData(device, &edgex_data); + acDestroyCommData(device, &edgey_data); + acDestroyCommData(device, &edgez_data); + acDestroyCommData(device, &sidexy_data); + acDestroyCommData(device, &sidexz_data); + acDestroyCommData(device, &sideyz_data); + + return AC_SUCCESS; +} + +static AcResult +acDeviceCommunicateHalosMPI(const Device device) +{ + // Configure + const int3 nn = (int3){ + device->local_config.int_params[AC_nx], + device->local_config.int_params[AC_ny], + device->local_config.int_params[AC_nz], + }; + const AcReal dt = FLT_EPSILON; // TODO replace with the real one + + // Corners + const int3 corner_a0s[] = { + (int3){NGHOST, NGHOST, NGHOST}, // + (int3){nn.x, NGHOST, NGHOST}, // + (int3){NGHOST, nn.y, NGHOST}, // + (int3){nn.x, nn.y, NGHOST}, // + + (int3){NGHOST, NGHOST, nn.z}, // + (int3){nn.x, NGHOST, nn.z}, // + (int3){NGHOST, nn.y, nn.z}, // + (int3){nn.x, nn.y, nn.z}, + }; + const int3 corner_b0s[] = { + (int3){0, 0, 0}, + (int3){NGHOST + nn.x, 0, 0}, + (int3){0, NGHOST + nn.y, 0}, + (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, + + (int3){0, 0, NGHOST + nn.z}, + (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, + (int3){0, NGHOST + nn.y, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, + }; + const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST}; + + // Edges X + const int3 edgex_a0s[] = { + (int3){NGHOST, NGHOST, NGHOST}, // + (int3){NGHOST, nn.y, NGHOST}, // + + (int3){NGHOST, NGHOST, nn.z}, // + (int3){NGHOST, nn.y, nn.z}, // + }; + const int3 edgex_b0s[] = { + (int3){NGHOST, 0, 0}, + (int3){NGHOST, NGHOST + nn.y, 0}, + + (int3){NGHOST, 0, NGHOST + nn.z}, + (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, + }; + const int3 edgex_dims = (int3){nn.x, NGHOST, NGHOST}; + + // Edges Y + const int3 edgey_a0s[] = { + (int3){NGHOST, NGHOST, NGHOST}, // + (int3){nn.x, NGHOST, NGHOST}, // + + (int3){NGHOST, NGHOST, nn.z}, // + (int3){nn.x, NGHOST, nn.z}, // + }; + const int3 edgey_b0s[] = { + (int3){0, NGHOST, 0}, + (int3){NGHOST + nn.x, NGHOST, 0}, + + (int3){0, NGHOST, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, + }; + const int3 edgey_dims = (int3){NGHOST, nn.y, NGHOST}; + + // Edges Z + const int3 edgez_a0s[] = { + (int3){NGHOST, NGHOST, NGHOST}, // + (int3){nn.x, NGHOST, NGHOST}, // + + (int3){NGHOST, nn.y, NGHOST}, // + (int3){nn.x, nn.y, NGHOST}, // + }; + const int3 edgez_b0s[] = { + (int3){0, 0, NGHOST}, + (int3){NGHOST + nn.x, 0, NGHOST}, + + (int3){0, NGHOST + nn.y, NGHOST}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, + }; + + const int3 edgez_dims = (int3){NGHOST, NGHOST, nn.z}; + + // Sides XY + const int3 sidexy_a0s[] = { + (int3){NGHOST, NGHOST, NGHOST}, // + (int3){NGHOST, NGHOST, nn.z}, // + }; + const int3 sidexy_b0s[] = { + (int3){NGHOST, NGHOST, 0}, // + (int3){NGHOST, NGHOST, NGHOST + nn.z}, // + }; + const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST}; + + // Sides XZ + const int3 sidexz_a0s[] = { + (int3){NGHOST, NGHOST, NGHOST}, // + (int3){NGHOST, nn.y, NGHOST}, // + }; + const int3 sidexz_b0s[] = { + (int3){NGHOST, 0, NGHOST}, // + (int3){NGHOST, NGHOST + nn.y, NGHOST}, // + }; + const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z}; + + // Sides YZ + const int3 sideyz_a0s[] = { + (int3){NGHOST, NGHOST, NGHOST}, // + (int3){nn.x, NGHOST, NGHOST}, // + }; + const int3 sideyz_b0s[] = { + (int3){0, NGHOST, NGHOST}, // + (int3){NGHOST + nn.x, NGHOST, NGHOST}, // + }; + const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z}; + + // Alloc + CommData corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s)); + CommData edgex_data = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s)); + CommData edgey_data = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s)); + CommData edgez_data = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s)); + CommData sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s)); + CommData sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s)); + CommData sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s)); + + // Communicate + Timer ttot; + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + timer_reset(&ttot); + MPI_Barrier(MPI_COMM_WORLD); + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acPackCommData(device, corner_a0s, &corner_data); + acPackCommData(device, edgex_a0s, &edgex_data); + acPackCommData(device, edgey_a0s, &edgey_data); + acPackCommData(device, edgez_a0s, &edgez_data); + acPackCommData(device, sidexy_a0s, &sidexy_data); + acPackCommData(device, sidexz_a0s, &sidexz_data); + acPackCommData(device, sideyz_a0s, &sideyz_data); + +#if MPI_GPUDIRECT_DISABLED + acTransferCommDataToHost(device, &corner_data); + acTransferCommDataToHost(device, &edgex_data); + acTransferCommDataToHost(device, &edgey_data); + acTransferCommDataToHost(device, &edgez_data); + acTransferCommDataToHost(device, &sidexy_data); + acTransferCommDataToHost(device, &sidexz_data); + acTransferCommDataToHost(device, &sideyz_data); +#endif + + acTransferCommData(device, corner_a0s, corner_b0s, &corner_data); + acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data); + acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data); + acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data); + acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data); + acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data); + acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data); + + acTransferCommDataWait(corner_data); + acTransferCommDataWait(edgex_data); + acTransferCommDataWait(edgey_data); + acTransferCommDataWait(edgez_data); + acTransferCommDataWait(sidexy_data); + acTransferCommDataWait(sidexz_data); + acTransferCommDataWait(sideyz_data); + +#if MPI_GPUDIRECT_DISABLED + acTransferCommDataToDevice(device, &corner_data); + acTransferCommDataToDevice(device, &edgex_data); + acTransferCommDataToDevice(device, &edgey_data); + acTransferCommDataToDevice(device, &edgez_data); + acTransferCommDataToDevice(device, &sidexy_data); + acTransferCommDataToDevice(device, &sidexz_data); + acTransferCommDataToDevice(device, &sideyz_data); +#endif + + acUnpackCommData(device, corner_b0s, &corner_data); + acUnpackCommData(device, edgex_b0s, &edgex_data); + acUnpackCommData(device, edgey_b0s, &edgey_data); + acUnpackCommData(device, edgez_b0s, &edgez_data); + acUnpackCommData(device, sidexy_b0s, &sidexy_data); + acUnpackCommData(device, sidexz_b0s, &sidexz_data); + acUnpackCommData(device, sideyz_b0s, &sideyz_data); + } cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); @@ -1269,6 +1534,10 @@ acDeviceRunMPITest(void) // Create model and candidate meshes AcMeshInfo info; acLoadConfig(AC_DEFAULT_CONFIG, &info); + info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx]; + info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy]; + info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz]; + info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound]; AcMesh model, candidate; @@ -1340,6 +1609,8 @@ acDeviceRunMPITest(void) ////////////////////////////////////////////////////////////// // INTEGRATION & BOUNDCONDS//////////////////////////////////// + // acDeviceCommunicateHalosMPI(device); + acDeviceIntegrateMPI(device, FLT_EPSILON); acDeviceCommunicateHalosMPI(device); /////////////////////////////////////////////////////////////// @@ -1361,6 +1632,8 @@ acDeviceRunMPITest(void) // VERIFY //////////////////////////////////////////////////// if (pid == 0) { + // acMeshApplyPeriodicBounds(&model); + acModelIntegrateStep(model, FLT_EPSILON); acMeshApplyPeriodicBounds(&model); acVerifyMesh(model, candidate);