WIP further MPI optimizations
@@ -1171,6 +1171,8 @@ acDeviceIntegrateMPI(const Device device, const AcReal dt)
     timer_reset(&ttot);
     MPI_Barrier(MPI_COMM_WORLD);
 
+    const int num_iterations = 1;
+    for (int i = 0; i < num_iterations; ++i) {
     for (int isubstep = 0; isubstep < 3; ++isubstep) {
         acPackCommData(device, corner_a0s, &corner_data);
         acPackCommData(device, edgex_a0s, &edgex_data);
@@ -1267,6 +1269,7 @@ acDeviceIntegrateMPI(const Device device, const AcReal dt)
         acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
         ////////////////////////////////////////////
     }
+    }
 
     cudaDeviceSynchronize();
     MPI_Barrier(MPI_COMM_WORLD);
@@ -1277,7 +1280,8 @@ acDeviceIntegrateMPI(const Device device, const AcReal dt)
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     if (!pid) {
-        printf("--- Total communication time per step: %f ms\n", msec);
+        printf("--- Total communication time per step w/ integration: %f ms\n",
+               msec / num_iterations);
 
         // Write out to file
         FILE* fp = fopen("benchmark.result", "a+");
@@ -1434,7 +1438,6 @@ acDeviceCommunicateHalosMPI(const Device device)
     timer_reset(&ttot);
     MPI_Barrier(MPI_COMM_WORLD);
 
-    for (int isubstep = 0; isubstep < 3; ++isubstep) {
         acPackCommData(device, corner_a0s, &corner_data);
         acPackCommData(device, edgex_a0s, &edgex_data);
         acPackCommData(device, edgey_a0s, &edgey_data);
@@ -1486,7 +1489,6 @@ acDeviceCommunicateHalosMPI(const Device device)
         acUnpackCommData(device, sidexy_b0s, &sidexy_data);
         acUnpackCommData(device, sidexz_b0s, &sidexz_data);
         acUnpackCommData(device, sideyz_b0s, &sideyz_data);
-    }
 
     cudaDeviceSynchronize();
     MPI_Barrier(MPI_COMM_WORLD);
@@ -1497,7 +1499,7 @@ acDeviceCommunicateHalosMPI(const Device device)
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     if (!pid) {
-        printf("--- Total communication time per step: %f ms\n", msec);
+        printf("--- Total communication time per substep (comm): %f ms\n", msec);
 
         // Write out to file
         FILE* fp = fopen("benchmark.result", "a+");
@@ -1517,6 +1519,13 @@ acDeviceCommunicateHalosMPI(const Device device)
     return AC_SUCCESS;
 }
 
+/*
+static int3
+findOptimalDecomposition(const int3 nn)
+{
+    int3 decomposition = (int3){1, 1, 1};
+}*/
+
 AcResult
 acDeviceRunMPITest(void)
 {
@@ -1632,7 +1641,6 @@ acDeviceRunMPITest(void)
 
     // VERIFY ////////////////////////////////////////////////////
     if (pid == 0) {
-        // acMeshApplyPeriodicBounds(&model);
        acModelIntegrateStep(model, FLT_EPSILON);
        acMeshApplyPeriodicBounds(&model);
 
@@ -1657,3 +1665,41 @@ acDeviceRunMPITest(void)
     return AC_FAILURE;
 }
 #endif // AC_MPI_ENABLED
+
+/*
+struct grid_s {
+    Device device;
+};
+
+typedef grid_s* Grid;
+
+AcResult
+acGridInit(void)
+{
+    MPI_Init(NULL, NULL);
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+    int name_len;
+    MPI_Get_processor_name(processor_name, &name_len);
+    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
+}
+
+AcResult
+acGridLoad(const AcMesh mesh, Grid* grid)
+{
+}
+
+AcResult
+acGridStore(const Grid grid, AcMesh* mesh)
+{
+}
+
+AcResult
+acGridQuit(AcGrid& grid)
+{
+}
+*/
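The measurable change in this commit is the benchmark loop in acDeviceIntegrateMPI(): the three integration substeps are wrapped in a num_iterations loop and the reported time is divided by the iteration count. Below is a self-contained, illustrative sketch of that measurement pattern only; it uses MPI_Wtime() and a generic integrate_step callback as stand-ins for the repo's own timer helpers (timer_reset, ttot) and ac* calls, which are not reproduced here.

/* Illustrative timing harness, not the Astaroth implementation. */
#include <mpi.h>
#include <stdio.h>

static void
benchmark_step_sketch(void (*integrate_step)(void), const int num_iterations)
{
    MPI_Barrier(MPI_COMM_WORLD);          /* align all ranks before timing */
    const double t0 = MPI_Wtime();

    for (int i = 0; i < num_iterations; ++i)
        integrate_step();                  /* e.g. the three communication + integration substeps */

    /* In the commit a cudaDeviceSynchronize() precedes the barrier so that
       asynchronous GPU work is included in the measurement. */
    MPI_Barrier(MPI_COMM_WORLD);
    const double msec = 1e3 * (MPI_Wtime() - t0);

    int pid;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    if (!pid)
        printf("--- Total communication time per step w/ integration: %f ms\n",
               msec / num_iterations);
}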
|
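The commit also adds a commented-out findOptimalDecomposition() stub that only initializes decomposition to {1, 1, 1}. Purely as an illustration of what such a search could eventually do (this is not the author's algorithm, and int3_hyp / find_decomposition_sketch below are hypothetical names), one could factor the process count into three dimensions and keep the most cube-like split:

/* Illustrative only: nothing below is part of the actual Astaroth code. */
#include <limits.h>

typedef struct { int x, y, z; } int3_hyp; /* stand-in for the CUDA int3 */

static int3_hyp
find_decomposition_sketch(const int nprocs)
{
    int3_hyp best    = {nprocs, 1, 1};
    int best_surface = INT_MAX;

    /* Try every factorization nprocs = dx * dy * dz and keep the one with
       the smallest dx + dy + dz, a cheap proxy for the most balanced
       (lowest-communication-surface) decomposition. */
    for (int dx = 1; dx <= nprocs; ++dx) {
        if (nprocs % dx)
            continue;
        for (int dy = 1; dy <= nprocs / dx; ++dy) {
            if ((nprocs / dx) % dy)
                continue;
            const int dz      = nprocs / (dx * dy);
            const int surface = dx + dy + dz;
            if (surface < best_surface) {
                best_surface = surface;
                best         = (int3_hyp){dx, dy, dz};
            }
        }
    }
    return best;
}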