Added integration to MPI comm, now completes a full integration step. Works at least on 2 nodes

This commit is contained in:
jpekkila
2020-03-24 16:55:38 +02:00
parent 37d6ad18d3
commit d520835c42

View File

@@ -981,7 +981,7 @@ acTransferCommDataWait(const CommData data)
} }
static AcResult static AcResult
acDeviceCommunicateHalosMPI(const Device device) acDeviceIntegrateMPI(const Device device, const AcReal dt)
{ {
// Configure // Configure
const int3 nn = (int3){ const int3 nn = (int3){
@@ -1171,57 +1171,322 @@ acDeviceCommunicateHalosMPI(const Device device)
timer_reset(&ttot); timer_reset(&ttot);
MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD);
acPackCommData(device, corner_a0s, &corner_data); for (int isubstep = 0; isubstep < 3; ++isubstep) {
acPackCommData(device, edgex_a0s, &edgex_data); acPackCommData(device, corner_a0s, &corner_data);
acPackCommData(device, edgey_a0s, &edgey_data); acPackCommData(device, edgex_a0s, &edgex_data);
acPackCommData(device, edgez_a0s, &edgez_data); acPackCommData(device, edgey_a0s, &edgey_data);
acPackCommData(device, sidexy_a0s, &sidexy_data); acPackCommData(device, edgez_a0s, &edgez_data);
acPackCommData(device, sidexz_a0s, &sidexz_data); acPackCommData(device, sidexy_a0s, &sidexy_data);
acPackCommData(device, sideyz_a0s, &sideyz_data); acPackCommData(device, sidexz_a0s, &sidexz_data);
acPackCommData(device, sideyz_a0s, &sideyz_data);
#if MPI_GPUDIRECT_DISABLED #if MPI_GPUDIRECT_DISABLED
acTransferCommDataToHost(device, &corner_data); acTransferCommDataToHost(device, &corner_data);
acTransferCommDataToHost(device, &edgex_data); acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data); acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data); acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data); acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data); acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data); acTransferCommDataToHost(device, &sideyz_data);
#endif #endif
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data); //////////// INNER INTEGRATION //////////////
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data); {
acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data); const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data); const int3 m2 = nn;
acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data); acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data); }
acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data); ////////////////////////////////////////////
acTransferCommDataWait(corner_data); acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
acTransferCommDataWait(edgex_data); acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
acTransferCommDataWait(edgey_data); acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
acTransferCommDataWait(edgez_data); acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
acTransferCommDataWait(sidexy_data); acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
acTransferCommDataWait(sidexz_data); acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
acTransferCommDataWait(sideyz_data); acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
acTransferCommDataWait(corner_data);
acTransferCommDataWait(edgex_data);
acTransferCommDataWait(edgey_data);
acTransferCommDataWait(edgez_data);
acTransferCommDataWait(sidexy_data);
acTransferCommDataWait(sidexz_data);
acTransferCommDataWait(sideyz_data);
#if MPI_GPUDIRECT_DISABLED #if MPI_GPUDIRECT_DISABLED
acTransferCommDataToDevice(device, &corner_data); acTransferCommDataToDevice(device, &corner_data);
acTransferCommDataToDevice(device, &edgex_data); acTransferCommDataToDevice(device, &edgex_data);
acTransferCommDataToDevice(device, &edgey_data); acTransferCommDataToDevice(device, &edgey_data);
acTransferCommDataToDevice(device, &edgez_data); acTransferCommDataToDevice(device, &edgez_data);
acTransferCommDataToDevice(device, &sidexy_data); acTransferCommDataToDevice(device, &sidexy_data);
acTransferCommDataToDevice(device, &sidexz_data); acTransferCommDataToDevice(device, &sidexz_data);
acTransferCommDataToDevice(device, &sideyz_data); acTransferCommDataToDevice(device, &sideyz_data);
#endif #endif
acUnpackCommData(device, corner_b0s, &corner_data); acUnpackCommData(device, corner_b0s, &corner_data);
acUnpackCommData(device, edgex_b0s, &edgex_data); acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data); acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data); acUnpackCommData(device, edgez_b0s, &edgez_data);
acUnpackCommData(device, sidexy_b0s, &sidexy_data); acUnpackCommData(device, sidexy_b0s, &sidexy_data);
acUnpackCommData(device, sidexz_b0s, &sidexz_data); acUnpackCommData(device, sidexz_b0s, &sidexz_data);
acUnpackCommData(device, sideyz_b0s, &sideyz_data); acUnpackCommData(device, sideyz_b0s, &sideyz_data);
//////////// OUTER INTEGRATION //////////////
acDeviceSynchronizeStream(device, STREAM_ALL); // Wait for unpacking
{ // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
}
{ // Back
const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
}
{ // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
}
{ // Top
const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
}
{ // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
}
{ // Right
const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
}
acDeviceSwapBuffers(device);
acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
////////////////////////////////////////////
}
cudaDeviceSynchronize();
MPI_Barrier(MPI_COMM_WORLD);
const double msec = timer_diff_nsec(ttot) / 1e6;
MPI_Barrier(MPI_COMM_WORLD);
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
if (!pid) {
printf("--- Total communication time per step: %f ms\n", msec);
// Write out to file
FILE* fp = fopen("benchmark.result", "a+");
fprintf(fp, "%d, %f\n", nprocs, msec);
fclose(fp);
}
// Dealloc
acDestroyCommData(device, &corner_data);
acDestroyCommData(device, &edgex_data);
acDestroyCommData(device, &edgey_data);
acDestroyCommData(device, &edgez_data);
acDestroyCommData(device, &sidexy_data);
acDestroyCommData(device, &sidexz_data);
acDestroyCommData(device, &sideyz_data);
return AC_SUCCESS;
}
static AcResult
acDeviceCommunicateHalosMPI(const Device device)
{
// Configure
const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
const AcReal dt = FLT_EPSILON; // TODO replace with the real one
// Corners
const int3 corner_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};
// Edges X
const int3 edgex_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
(int3){NGHOST, 0, NGHOST + nn.z},
(int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
};
const int3 edgex_dims = (int3){nn.x, NGHOST, NGHOST};
// Edges Y
const int3 edgey_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
(int3){0, NGHOST, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
};
const int3 edgey_dims = (int3){NGHOST, nn.y, NGHOST};
// Edges Z
const int3 edgez_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
(int3){0, NGHOST + nn.y, NGHOST},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
};
const int3 edgez_dims = (int3){NGHOST, NGHOST, nn.z};
// Sides XY
const int3 sidexy_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};
// Sides XZ
const int3 sidexz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};
// Sides YZ
const int3 sideyz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};
// Alloc
CommData corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
CommData edgex_data = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
CommData edgey_data = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
CommData edgez_data = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
CommData sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
CommData sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
CommData sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));
// Communicate
Timer ttot;
cudaDeviceSynchronize();
MPI_Barrier(MPI_COMM_WORLD);
timer_reset(&ttot);
MPI_Barrier(MPI_COMM_WORLD);
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acPackCommData(device, corner_a0s, &corner_data);
acPackCommData(device, edgex_a0s, &edgex_data);
acPackCommData(device, edgey_a0s, &edgey_data);
acPackCommData(device, edgez_a0s, &edgez_data);
acPackCommData(device, sidexy_a0s, &sidexy_data);
acPackCommData(device, sidexz_a0s, &sidexz_data);
acPackCommData(device, sideyz_a0s, &sideyz_data);
#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToHost(device, &corner_data);
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
acTransferCommDataWait(corner_data);
acTransferCommDataWait(edgex_data);
acTransferCommDataWait(edgey_data);
acTransferCommDataWait(edgez_data);
acTransferCommDataWait(sidexy_data);
acTransferCommDataWait(sidexz_data);
acTransferCommDataWait(sideyz_data);
#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToDevice(device, &corner_data);
acTransferCommDataToDevice(device, &edgex_data);
acTransferCommDataToDevice(device, &edgey_data);
acTransferCommDataToDevice(device, &edgez_data);
acTransferCommDataToDevice(device, &sidexy_data);
acTransferCommDataToDevice(device, &sidexz_data);
acTransferCommDataToDevice(device, &sideyz_data);
#endif
acUnpackCommData(device, corner_b0s, &corner_data);
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data);
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
}
cudaDeviceSynchronize(); cudaDeviceSynchronize();
MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD);
@@ -1269,6 +1534,10 @@ acDeviceRunMPITest(void)
// Create model and candidate meshes // Create model and candidate meshes
AcMeshInfo info; AcMeshInfo info;
acLoadConfig(AC_DEFAULT_CONFIG, &info); acLoadConfig(AC_DEFAULT_CONFIG, &info);
info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy];
info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz];
info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];
AcMesh model, candidate; AcMesh model, candidate;
@@ -1340,6 +1609,8 @@ acDeviceRunMPITest(void)
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// INTEGRATION & BOUNDCONDS//////////////////////////////////// // INTEGRATION & BOUNDCONDS////////////////////////////////////
// acDeviceCommunicateHalosMPI(device);
acDeviceIntegrateMPI(device, FLT_EPSILON);
acDeviceCommunicateHalosMPI(device); acDeviceCommunicateHalosMPI(device);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@@ -1361,6 +1632,8 @@ acDeviceRunMPITest(void)
// VERIFY //////////////////////////////////////////////////// // VERIFY ////////////////////////////////////////////////////
if (pid == 0) { if (pid == 0) {
// acMeshApplyPeriodicBounds(&model);
acModelIntegrateStep(model, FLT_EPSILON);
acMeshApplyPeriodicBounds(&model); acMeshApplyPeriodicBounds(&model);
acVerifyMesh(model, candidate); acVerifyMesh(model, candidate);