More concurrent kernels and MPI comm

Johannes Pekkila
2019-10-23 12:07:23 +02:00
parent 04867334e7
commit 1d81333ff7


@@ -1995,14 +1995,18 @@ acDeviceBoundcondStepMPI_single_step_best(const Device device, AcMesh* submesh)
 // BEST USE THIS! (full integration step)
 static AcResult
-acDeviceBoundcondStepMPI(const Device device, AcMesh* submesh)
+acDeviceIntegrateStepMPI(const Device device, const AcReal dt, AcMesh* submesh)
 {
     const int mx = device->local_config.int_params[AC_mx];
     const int my = device->local_config.int_params[AC_my];
     const int mz = device->local_config.int_params[AC_mz];
+    const int nx = device->local_config.int_params[AC_nx];
+    const int ny = device->local_config.int_params[AC_ny];
+    const int nz = device->local_config.int_params[AC_nz];
     const size_t count = mx * my * NGHOST;
-    for (int isubstep = 0; isubtep < 3; ++isubstep) {
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acDeviceSynchronizeStream(device, STREAM_ALL);
         // Local boundconds
         for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
             // Front plate local
@@ -2018,21 +2022,15 @@ acDeviceBoundcondStepMPI(const Device device, AcMesh* submesh)
                 acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end);
             }
         }
+#define INNER_BOUNDCOND_STREAM ((Stream)(NUM_STREAMS - 1))
         // Inner boundconds (while waiting)
         for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
             const int3 start = (int3){0, 0, 2 * NGHOST};
             const int3 end = (int3){mx, my, mz - 2 * NGHOST};
-            acDevicePeriodicBoundcondStep(device, (Stream)(NUM_STREAMS - 1), (VertexBufferHandle)i,
+            acDevicePeriodicBoundcondStep(device, INNER_BOUNDCOND_STREAM, (VertexBufferHandle)i,
                                           start, end);
         }
-        // Inner integration
-        {
-            ERRCHK(NUM_STREAMS - 2 >= 0);
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = node->subgrid.n;
-            acDeviceIntegrateSubstep(devices, (Stream)(NUM_STREAMS - 2), isubstep, m1, m2, dt);
-        }
         // MPI
         MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES];
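
The structure being built here is the classic overlap pattern: post non-blocking receives and sends for the halo plates, integrate the interior while the messages are in flight, then wait and finish the halo-dependent regions. For reference, below is a minimal MPI-only sketch of that pattern with a 1D decomposition along z (matching the AC_nz % num_processes == 0 assumption later in this file); the compute_interior/compute_plates helpers and the buffer layout are illustrative stand-ins, not Astaroth API.

/* Minimal sketch of communication/computation overlap with a 1D z-decomposition.
 * Hypothetical helpers stand in for the real GPU kernels; only the MPI calls
 * and their ordering mirror the code above. */
#include <mpi.h>
#include <stdlib.h>

#define NGHOST 3
#define NX 32
#define NY 32
#define NZ_LOCAL 32 /* nz per process */
#define PLATE (NX * NY * NGHOST)

static void compute_interior(double* f) { (void)f; /* ... integrate cells that need no ghost data ... */ }
static void compute_plates(double* f)   { (void)f; /* ... integrate cells next to the received ghost zones ... */ }

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    const int up   = (pid + 1) % nprocs;          /* periodic neighbors in z */
    const int down = (pid + nprocs - 1) % nprocs;

    double* field = calloc((size_t)NX * NY * (NZ_LOCAL + 2 * NGHOST), sizeof(double));
    double* ghost_front = field;                                          /* z in [0, NGHOST) */
    double* ghost_back  = field + (size_t)NX * NY * (NZ_LOCAL + NGHOST);  /* z in [nz+NGHOST, nz+2*NGHOST) */
    double* edge_front  = field + (size_t)NX * NY * NGHOST;               /* first NGHOST computed plates */
    double* edge_back   = field + (size_t)NX * NY * NZ_LOCAL;             /* last NGHOST computed plates */

    MPI_Request reqs[4];
    MPI_Irecv(ghost_front, PLATE, MPI_DOUBLE, down, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Irecv(ghost_back,  PLATE, MPI_DOUBLE, up,   1, MPI_COMM_WORLD, &reqs[1]);
    MPI_Isend(edge_back,   PLATE, MPI_DOUBLE, up,   0, MPI_COMM_WORLD, &reqs[2]);
    MPI_Isend(edge_front,  PLATE, MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &reqs[3]);

    compute_interior(field); /* overlaps with the transfers posted above */

    MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
    compute_plates(field);   /* needs the freshly received ghost zones */

    free(field);
    MPI_Finalize();
    return 0;
}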
@@ -2088,51 +2086,60 @@ acDeviceBoundcondStepMPI(const Device device, AcMesh* submesh)
                                       i + NUM_VTXBUF_HANDLES, MPI_COMM_WORLD, &request);
             }
         }
+        // Inner integration
+        {
+            ERRCHK(NUM_STREAMS - 2 >= 0);
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = (int3){mx, my, mz} - m1;
+            acDeviceIntegrateSubstep(device, (Stream)(NUM_STREAMS - 2), isubstep, m1, m2, dt);
+        }
         for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
             MPI_Status status;
             MPI_Wait(&recv_requests[i], &status);
             MPI_Wait(&recv_requests[i + NUM_VTXBUF_HANDLES], &status);
         }
+        acDeviceSynchronizeStream(device, INNER_BOUNDCOND_STREAM);
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Front
+        { // Front
             const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
-            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
+            const int3 m2 = m1 + (int3){nx, ny, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Back
-            const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
-            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nz};
+            const int3 m2 = m1 + (int3){nx, ny, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Bottom
+        { // Bottom
             const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
+            const int3 m2 = m1 + (int3){nx, NGHOST, nz - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Top
-            const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
+        { // Top
+            const int3 m1 = (int3){NGHOST, ny, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nx, NGHOST, nz - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Left
+        { // Left
             const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
-                                        node->subgrid.n.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
+            const int3 m2 = m1 + (int3){NGHOST, ny - 2 * NGHOST, nz - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Right
-            const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
-                                        node->subgrid.n.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
+        { // Right
+            const int3 m1 = (int3){nx, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, ny - 2 * NGHOST, nz - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
         }
-        acNodeSwapBuffers(node);
+        MPI_Barrier(MPI_COMM_WORLD);
+        acDeviceSwapBuffers(device);
+        MPI_Barrier(MPI_COMM_WORLD);
     }
     return AC_SUCCESS;
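
A quick way to convince oneself that the index arithmetic above is right: the inner box [2*NGHOST, m - 2*NGHOST) together with the six face boxes (front, back, bottom, top, left, right) should tile the computational domain [NGHOST, n + NGHOST) with exactly one box per cell. Below is a standalone check of that property using the same m1/m2 boxes as the new code; nx, ny, nz are arbitrary test sizes, and NGHOST is assumed to be 3 here.

/* Standalone check that the seven integration boxes above (inner + 6 faces)
 * tile the computational domain exactly once. */
#include <assert.h>
#include <stdio.h>

#define NGHOST 3
static const int nx = 16, ny = 20, nz = 24;

typedef struct { int x, y, z; } int3;

static int inside(const int3 p, const int3 m1, const int3 m2)
{
    return p.x >= m1.x && p.x < m2.x && p.y >= m1.y && p.y < m2.y && p.z >= m1.z && p.z < m2.z;
}

int main(void)
{
    const int G = NGHOST, mx = nx + 2 * G, my = ny + 2 * G, mz = nz + 2 * G;
    const int3 boxes[][2] = {
        {{2 * G, 2 * G, 2 * G}, {nx, ny, nz}},            /* inner  */
        {{G, G, G},          {nx + G, ny + G, 2 * G}},    /* front  */
        {{G, G, nz},         {nx + G, ny + G, nz + G}},   /* back   */
        {{G, G, 2 * G},      {nx + G, 2 * G, nz}},        /* bottom */
        {{G, ny, 2 * G},     {nx + G, ny + G, nz}},       /* top    */
        {{G, 2 * G, 2 * G},  {2 * G, ny, nz}},            /* left   */
        {{nx, 2 * G, 2 * G}, {nx + G, ny, nz}},           /* right  */
    };
    const int nboxes = (int)(sizeof(boxes) / sizeof(boxes[0]));

    for (int z = 0; z < mz; ++z)
        for (int y = 0; y < my; ++y)
            for (int x = 0; x < mx; ++x) {
                const int3 p = {x, y, z};
                int hits = 0;
                for (int b = 0; b < nboxes; ++b)
                    hits += inside(p, boxes[b][0], boxes[b][1]);
                const int computational = x >= G && x < nx + G && y >= G && y < ny + G &&
                                          z >= G && z < nz + G;
                assert(hits == (computational ? 1 : 0)); /* exactly one box per inner cell, none in the halo */
            }
    puts("Boxes tile the computational domain exactly once.");
    return 0;
}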
@@ -2313,10 +2320,12 @@ acDeviceRunMPITest(void)
     AcMeshInfo info;
     acLoadConfig(AC_DEFAULT_CONFIG, &info);
+    // Rewind here!
     const int nn = 512;
     info.int_params[AC_nx] = info.int_params[AC_ny] = info.int_params[AC_nz] = nn;
     acUpdateConfig(&info);
+    /*
     AcMesh model, candidate;
     // Master CPU
@@ -2326,7 +2335,7 @@ acDeviceRunMPITest(void)
         acMeshRandomize(&model);
         acMeshApplyPeriodicBounds(&model);
-    }
+    }*/
     assert(info.int_params[AC_nz] % num_processes == 0);
@@ -2345,8 +2354,9 @@ acDeviceRunMPITest(void)
     // Create submesh
     AcMesh submesh;
     acMeshCreate(submesh_info, &submesh);
-    acDeviceDistributeMeshMPI(model, &submesh);
+    acMeshRandomize(&submesh);
+    // acDeviceDistributeMeshMPI(model, &submesh);
     ////////////////////////////////////////////////////////////////////////////////////////////////
     Device device;
@@ -2355,16 +2365,18 @@ acDeviceRunMPITest(void)
     // Warmup
     acDeviceSynchronizeStream(device, STREAM_ALL);
-    for (int i = 0; i < 10; ++i) {
-        acDeviceBoundcondStepMPI(device, &submesh);
+    for (int i = 0; i < 2; ++i) {
+        // acDeviceBoundcondStepMPI(device, &submesh);
+        acDeviceIntegrateStepMPI(device, FLT_EPSILON, &submesh);
     }
     // Benchmark
-    const int num_iters = 100;
+    const int num_iters = 10;
     Timer total_time;
     timer_reset(&total_time);
     for (int i = 0; i < num_iters; ++i) {
-        acDeviceBoundcondStepMPI(device, &submesh);
+        // acDeviceBoundcondStepMPI(device, &submesh);
+        acDeviceIntegrateStepMPI(device, FLT_EPSILON, &submesh);
     }
     if (pid == 0) {
         const double ms_elapsed = timer_diff_nsec(total_time) / 1e6;
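
The warm-up and benchmark scaffolding above follows the usual MPI timing recipe: run a few untimed iterations, synchronize all ranks, time a fixed number of iterations, and report from rank 0 only. Below is a generic sketch of that recipe, using MPI_Wtime in place of the project's Timer helper and a hypothetical step() standing in for acDeviceIntegrateStepMPI.

/* Generic MPI benchmarking scaffold: warm up, synchronize, time, report on rank 0.
 * step() is a hypothetical stand-in for the routine being measured. */
#include <mpi.h>
#include <stdio.h>

static void step(void) { /* ... work being benchmarked ... */ }

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int pid;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    for (int i = 0; i < 2; ++i) /* warmup: untimed iterations */
        step();

    MPI_Barrier(MPI_COMM_WORLD); /* start all ranks from the same point */
    const int num_iters = 10;
    const double t0 = MPI_Wtime();
    for (int i = 0; i < num_iters; ++i)
        step();
    MPI_Barrier(MPI_COMM_WORLD); /* include the slowest rank in the measurement */
    const double ms_elapsed = (MPI_Wtime() - t0) * 1e3;

    if (pid == 0)
        printf("Average time per step: %.3f ms\n", ms_elapsed / num_iters);

    MPI_Finalize();
    return 0;
}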
@@ -2377,16 +2389,18 @@ acDeviceRunMPITest(void)
     acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
     acDeviceDestroy(device);
     ////////////////////////////////////////////////////////////////////////////////////////////////
-    acDeviceGatherMeshMPI(submesh, &candidate);
+    // acDeviceGatherMeshMPI(submesh, &candidate);
     acMeshDestroy(&submesh);
+    /*
     // Master CPU
     if (pid == 0) {
         acVerifyMesh(model, candidate);
         acMeshDestroy(&model);
         acMeshDestroy(&candidate);
     }
+    */
     MPI_Finalize();
     return AC_FAILURE;