diff --git a/src/core/device.cc b/src/core/device.cc
index 9ee2c1a..e594a88 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -33,6 +33,10 @@
 #include "kernels/common.cuh"
 
+#if AC_MPI_ENABLED
+#include "kernels/packing.cuh"
+#endif
+
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 
 struct device_s {
@@ -48,7 +52,9 @@ struct device_s {
     AcReal* reduce_result;
 
 #if AC_MPI_ENABLED
-    // TODO
+#define NUM_CORNERS (8)
+    PackedData corners_send[NUM_CORNERS];
+    PackedData corners_recv[NUM_CORNERS];
 #endif
 };
 
@@ -80,6 +86,29 @@ isWithin(const int3 idx, const int3 min, const int3 max)
     return false;
 }
 
+static PackedData
+acCreatePackedData(const int3 dims)
+{
+    PackedData data = {0};
+
+    data.dims = dims;
+
+    const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
+    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
+
+    return data;
+}
+
+static AcResult
+acDestroyPackedData(PackedData* data)
+{
+    data->dims = (int3){-1, -1, -1};
+    cudaFree(data->data);
+    data->data = NULL;
+
+    return AC_SUCCESS;
+}
+
 AcResult
 acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
 {
@@ -130,6 +159,11 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
 #if AC_MPI_ENABLED
     // TODO
+    const int3 dims = (int3){NGHOST, NGHOST, NGHOST};
+    for (int i = 0; i < NUM_CORNERS; ++i) {
+        device->corners_send[i] = acCreatePackedData(dims);
+        device->corners_recv[i] = acCreatePackedData(dims);
+    }
 #endif
 
     // Device constants
@@ -166,7 +200,10 @@ acDeviceDestroy(Device device)
     cudaFree(device->reduce_result);
 
 #if AC_MPI_ENABLED
-    // TODO
+    for (int i = 0; i < NUM_CORNERS; ++i) {
+        acDestroyPackedData(&device->corners_send[i]);
+        acDestroyPackedData(&device->corners_recv[i]);
+    }
 #endif
 
     // Concurrency
@@ -741,6 +778,7 @@ acDeviceGatherMeshMPI(const AcMesh src, AcMesh* dst)
 #endif
 */
 
+#if 0 // Not using 1D decomp
 // 1D decomp
 static AcResult
 acDeviceBoundStepMPI(const Device device)
@@ -974,6 +1012,7 @@ acDeviceIntegrateStepMPI(const Device device, const AcReal dt)
 
     return AC_SUCCESS;
 }
+#endif
 
 static int3
 decompose(const int target)
@@ -1121,21 +1160,461 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
     }
 }
 
-#include "kernels/packing.cuh"
-
+/*
 static AcResult
 acDeviceCommunicateHalosMPI(const Device device)
 {
     acDeviceSynchronizeStream(device, STREAM_ALL);
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
     int nprocs, pid;
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
     const int3 decomp = decompose(nprocs);
 
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    // Pack data
+    const int3 a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
+        (int3){NGHOST, NGHOST, nn.z},   //
+        (int3){nn.x, NGHOST, nn.z},     //
+        (int3){NGHOST, nn.y, nn.z},     //
+        (int3){nn.x, nn.y, nn.z},
+    };
+    const int3 b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+
+        (int3){0, 0, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    const cudaStream_t stream = device->streams[STREAM_DEFAULT];
+    for (int i = 0; i < ARRAY_SIZE(a0s); ++i)
+        acKernelPackData(stream, device->vba, a0s[i], device->corners_send[i]);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    acDeviceSynchronizeStream(device, STREAM_ALL); // TODO debug remove
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                const int3 neighbor = (int3){i, j, k};
+
+                for (int a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) {
+                    const int3 a0 = a0s[a_idx];
+                    const int3 a1 = a0 + device->corners_send[a_idx].dims;
+
+                    const int3 nmin = (int3){NGHOST, NGHOST, NGHOST};
+                    const int3 nmax = nmin + nn + (int3){1, 1, 1};
+                    ERRCHK_ALWAYS(isWithin(a1, nmin, nmax));
+
+                    const int3 b0 = a0 - neighbor * nn;
+                    const int3 b1 = a1 - neighbor * nn;
+
+                    const int3 mmin = (int3){0, 0, 0};
+                    const int3 mmax = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST} + nn;
+                    if (isWithin(b0, mmin, mmax) && isWithin(b1, mmin, mmax + (int3){1, 1, 1})) {
+                        printf("neighbor ");
+                        print_int3(neighbor);
+                        printf("\n");
+                        printf("\tb0: ");
+                        print_int3(b0);
+                        printf("\n");
+
+                        for (int b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) {
+                            if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y &&
+                                b0s[b_idx].z == b0.z) {
+
+                                ERRCHK_ALWAYS(a_idx < NUM_CORNERS);
+                                ERRCHK_ALWAYS(b_idx < NUM_CORNERS);
+
+                                const int3 pid3d = getPid3D(pid, decomp);
+                                const size_t count = device->corners_send[a_idx].dims.x *
+                                                     device->corners_send[a_idx].dims.y *
+                                                     device->corners_send[a_idx].dims.z *
+                                                     NUM_VTXBUF_HANDLES;
+
+                                MPI_Request send_req, recv_req;
+                                MPI_Isend((device->corners_send[a_idx]).data, count, datatype,
+                                          getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD,
+                                          &send_req);
+
+                                MPI_Irecv((device->corners_recv[b_idx]).data, count, datatype,
+                                          getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD,
+                                          &recv_req);
+
+                                MPI_Status status;
+                                MPI_Wait(&recv_req, &status);
+
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    printf("------------------\n");
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    acDeviceSynchronizeStream(device, STREAM_ALL);
+    // Unpack data
+    for (int i = 0; i < ARRAY_SIZE(b0s); ++i) {
+        acKernelUnpackData(stream, device->corners_recv[i], b0s[i], device->vba);
+    }
+
+    return AC_SUCCESS;
+}*/
+
+static AcResult
+acDeviceCommunicateCornersMPI(const Device device)
+{
+    cudaSetDevice(device->id);
+    acDeviceSynchronizeStream(device, STREAM_ALL);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const int3 decomp = decompose(nprocs);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    // Pack data
+    const int3 a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
+
+        (int3){NGHOST, NGHOST, nn.z},   //
+        (int3){nn.x, NGHOST, nn.z},     //
+        (int3){NGHOST, nn.y, nn.z},     //
+        (int3){nn.x, nn.y, nn.z},
+    };
+    const int3 b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+
+        (int3){0, 0, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    const int3 dims = (int3){NGHOST, NGHOST, NGHOST};
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                for (int a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) {
+                    for (int b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) {
+                        const int3 neighbor = (int3){i, j, k};
+
+                        const int3 a0 = a0s[a_idx];
+                        const int3 a1 = a0 + dims;
+
+                        const int3 b0 = a0 - neighbor * nn;
+                        const int3 b1 = a1 - neighbor * nn;
+
+                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
+
+                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+                            PackedData src = acCreatePackedData(dims);
+                            PackedData dst = acCreatePackedData(dims);
+
+                            const cudaStream_t stream = device->streams[STREAM_DEFAULT];
+                            acKernelPackData(stream, device->vba, a0, src);
+                            acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+
+                            const int3 pid3d = getPid3D(pid, decomp);
+                            MPI_Request send_req, recv_req;
+                            MPI_Isend(src.data, count, datatype, getPid(pid3d + neighbor, decomp),
+                                      b_idx, MPI_COMM_WORLD, &send_req);
+                            MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
+                                      b_idx, MPI_COMM_WORLD, &recv_req);
+
+                            MPI_Status status;
+                            MPI_Wait(&recv_req, &status);
+
+                            acKernelUnpackData(stream, dst, b0, device->vba);
+                            acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+
+                            acDestroyPackedData(&src);
+                            acDestroyPackedData(&dst);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
+static AcResult
+acDeviceCommunicateEdgesMPI(const Device device)
+{
+    cudaSetDevice(device->id);
+    acDeviceSynchronizeStream(device, STREAM_ALL);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const int3 decomp = decompose(nprocs);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    // X-axis (TODO Y and Z)
+    // Pack data
+    const int3 a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z},   //
+        (int3){NGHOST, nn.y, nn.z},     //
+    };
+    const int3 b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    const int3 dims = (int3){nn.x, NGHOST, NGHOST};
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                for (int a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) {
+                    for (int b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) {
+                        const int3 neighbor = (int3){i, j, k};
+
+                        const int3 a0 = a0s[a_idx];
+                        const int3 a1 = a0 + dims;
+
+                        const int3 b0 = a0 - neighbor * nn;
+                        const int3 b1 = a1 - neighbor * nn;
+
+                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
+                            printf("Transfer: ");
+                            print_int3(a0);
+                            printf(" -> ");
+                            print_int3(b0);
+                            printf("\n");
+                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+                            PackedData src = acCreatePackedData(dims);
+                            PackedData dst = acCreatePackedData(dims);
+
+                            const cudaStream_t stream = device->streams[STREAM_DEFAULT];
+                            acKernelPackData(stream, device->vba, a0, src);
+                            acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+
+                            const int3 pid3d = getPid3D(pid, decomp);
+                            MPI_Request send_req, recv_req;
+                            MPI_Isend(src.data, count, datatype, getPid(pid3d + neighbor, decomp),
+                                      b_idx, MPI_COMM_WORLD, &send_req);
+                            MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
+                                      b_idx, MPI_COMM_WORLD, &recv_req);
+
+                            MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+                            MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
+
+                            acKernelUnpackData(stream, dst, b0, device->vba);
+                            acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+
+                            acDestroyPackedData(&src);
+                            acDestroyPackedData(&dst);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return AC_SUCCESS;
+}
+
+static AcResult
+acDeviceCommunicateHalosMPI(const Device device)
+{
+    acDeviceCommunicateCornersMPI(device);
+    acDeviceCommunicateEdgesMPI(device);
+    return AC_SUCCESS;
+}
+/*
+static AcResult
+acDeviceCommunicateHalosMPI(const Device device)
+{
+    acDeviceSynchronizeStream(device, STREAM_ALL);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const int3 decomp = decompose(nprocs);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    // Pack data
+    const int3 a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
+        (int3){NGHOST, NGHOST, nn.z},   //
+        (int3){nn.x, NGHOST, nn.z},     //
+        (int3){NGHOST, nn.y, nn.z},     //
+        (int3){nn.x, nn.y, nn.z},
+    };
+    const int3 b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+
+        (int3){0, 0, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                const int3 neighbor = (int3){i, j, k};
+
+                for (int a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) {
+                    const int3 a0 = a0s[a_idx];
+                    const int3 a1 = a0 + device->corners_send[a_idx].dims;
+
+                    const int3 nmin = (int3){NGHOST, NGHOST, NGHOST};
+                    const int3 nmax = nmin + nn + (int3){1, 1, 1};
+                    ERRCHK_ALWAYS(isWithin(a1, nmin, nmax));
+
+                    const int3 b0 = a0 - neighbor * nn;
+                    const int3 b1 = a1 - neighbor * nn;
+
+                    const int3 mmin = (int3){0, 0, 0};
+                    const int3 mmax = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST} + nn;
+                    if (isWithin(b0, mmin, mmax) && isWithin(b1, mmin, mmax + (int3){1, 1, 1})) {
+                        printf("neighbor ");
+                        print_int3(neighbor);
+                        printf("\n");
+                        printf("\tb0: ");
+                        print_int3(b0);
+                        printf("\n");
+
+                        for (int b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) {
+                            if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y &&
+                                b0s[b_idx].z == b0.z) {
+
+                                ERRCHK_ALWAYS(a_idx < NUM_CORNERS);
+                                ERRCHK_ALWAYS(b_idx < NUM_CORNERS);
+                                const int3 pid3d = getPid3D(pid, decomp);
+
+                                const int3 dims = device->corners_send[a_idx].dims;
+                                const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+                                const cudaStream_t stream = device->streams[STREAM_DEFAULT];
+
+                                AcReal *src, *dst;
+                                ERRCHK_CUDA_ALWAYS(
+                                    cudaMalloc((void**)&src, count * sizeof(src[0])));
+                                ERRCHK_CUDA_ALWAYS(
+                                    cudaMalloc((void**)&dst, count * sizeof(dst[0])));
+
+                                PackedData srcdata = {dims, src};
+                                acKernelPackData(stream, device->vba, a0, srcdata);
+                                acDeviceSynchronizeStream(device, STREAM_ALL);
+
+                                MPI_Request send_req, recv_req;
+                                MPI_Isend(src, count, datatype, getPid(pid3d + neighbor, decomp),
+                                          b_idx, MPI_COMM_WORLD, &send_req);
+                                MPI_Irecv(dst, count, datatype, getPid(pid3d - neighbor, decomp),
+                                          b_idx, MPI_COMM_WORLD, &recv_req);
+
+                                MPI_Status status;
+                                MPI_Wait(&recv_req, &status);
+
+                                PackedData dstdata = {dims, dst};
+                                acKernelUnpackData(stream, dstdata, b0, device->vba);
+                                acDeviceSynchronizeStream(device, STREAM_ALL);
+                                cudaFree(src);
+                                cudaFree(dst);
+
+                                printf("Sent!\n");
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    printf("------------------\n");
+    return AC_SUCCESS;
+}*/
+
 // From Astaroth Utils
 #include "src/utils/config_loader.h"
 #include "src/utils/memory.h"
@@ -1331,15 +1810,14 @@ acDeviceRunMPITest(void)
         printf("Time per step: %f ms\n", ms_elapsed / num_iters);
 
         const size_t nth_index = int(nth_percentile * num_iters);
-        printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), results[nth_index]);
+        printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile),
+               results[nth_index]);
 
         // Write out
         char buf[256];
-        sprintf(buf, "nprocs_%d_result_%s.bench", nprocs, BENCH_STRONG_SCALING ? "strong" : "weak");
-        FILE* fp = fopen(buf, "w");
-        ERRCHK_ALWAYS(fp);
-        fprintf(fp, "nprocs, percentile (%dth)\n", int(100 * nth_percentile));
-        fprintf(fp, "%d, %g\n", nprocs, results[nth_index]);
+        sprintf(buf, "nprocs_%d_result_%s.bench", nprocs,
+                BENCH_STRONG_SCALING ? "strong" : "weak");
+        FILE* fp = fopen(buf, "w");
+        ERRCHK_ALWAYS(fp);
+        fprintf(fp, "nprocs, percentile (%dth)\n", int(100 * nth_percentile));
+        fprintf(fp, "%d, %g\n", nprocs, results[nth_index]);
         fclose(fp);
     }
     // Benchmark end ///////////////////////////////////////////////////////////////////////
@@ -1385,9 +1863,8 @@ acDeviceRunMPITest(void)
     info.real_params[AC_inv_dsx]   = AcReal(1.0) / info.real_params[AC_dsx];
     info.real_params[AC_inv_dsy]   = AcReal(1.0) / info.real_params[AC_dsy];
     info.real_params[AC_inv_dsz]   = AcReal(1.0) / info.real_params[AC_dsz];
-    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];
-    acUpdateConfig(&info);
-    acPrintMeshInfo(info);
+    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] *
+                                     info.real_params[AC_cs_sound];
+    acUpdateConfig(&info);
+    acPrintMeshInfo(info);
 
     ERRCHK_ALWAYS(info.int_params[AC_nz] % num_processes == 0);
 #if BENCH_STRONG_SCALING
@@ -1509,7 +1986,8 @@ acDeviceRunMPITest(void)
         printf("Time per step: %f ms\n", ms_elapsed / num_iters);
 
         const size_t nth_index = int(nth_percentile * num_iters);
-        printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), results[nth_index]);
+        printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile),
+               results[nth_index]);
 
         // Write out
         char buf[256];