diff --git a/src/core/device.cc b/src/core/device.cc index 11c6f3e..2532640 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -33,10 +33,6 @@ #include "kernels/common.cuh" -#if AC_MPI_ENABLED -#include "kernels/packing.cuh" -#endif - #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) struct device_s { @@ -50,105 +46,16 @@ struct device_s { VertexBufferArray vba; AcReal* reduce_scratchpad; AcReal* reduce_result; - -#if AC_MPI_ENABLED -#define NUM_CORNERS (8) - PackedData corners_send[NUM_CORNERS]; - PackedData corners_recv[NUM_CORNERS]; -#endif }; #include "kernels/boundconds.cuh" #include "kernels/integration.cuh" #include "kernels/reductions.cuh" -#if PACKED_DATA_TRANSFERS // Defined in device.cuh +#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead // #include "kernels/pack_unpack.cuh" #endif -static void -print_int3(const int3 val) -{ - printf("(%d, %d, %d)", val.x, val.y, val.z); -} - -static bool -isWithin(const int3 idx, const int3 min, const int3 max) -{ - if (idx.x < max.x && // - idx.y < max.y && // - idx.z < max.z && // - idx.x >= min.x && // - idx.y >= min.y && // - idx.z >= min.z) - return true; - else - return false; -} - -static PackedData -acCreatePackedData(const int3 dims) -{ - PackedData data = {}; - - data.dims = dims; - - const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; - ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes)); - - return data; -} - -static AcResult -acDestroyPackedData(PackedData* data) -{ - data->dims = (int3){-1, -1, -1}; - cudaFree(data->data); - data->data = NULL; - - return AC_SUCCESS; -} - -static PackedData -acCreatePackedDataHost(const int3 dims) -{ - PackedData data = {}; - - data.dims = dims; - - const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; - data.data = (AcReal*)malloc(bytes); - ERRCHK_ALWAYS(data.data); - - return data; -} - -static void -acTransferPackedDataToHost(const PackedData ddata, PackedData* hdata) -{ - const size_t bytes = ddata.dims.x * ddata.dims.y * ddata.dims.z * sizeof(ddata.data[0]) * - NUM_VTXBUF_HANDLES; - ERRCHK_CUDA_ALWAYS(cudaMemcpy(hdata->data, ddata.data, bytes, cudaMemcpyDeviceToHost)); -} - -static void -acTransferPackedDataToDevice(const PackedData hdata, PackedData* ddata) -{ - const size_t bytes = hdata.dims.x * hdata.dims.y * hdata.dims.z * sizeof(hdata.data[0]) * - NUM_VTXBUF_HANDLES; - ERRCHK_CUDA_ALWAYS(cudaMemcpy(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice)); -} - -static AcResult -acDestroyPackedDataHost(PackedData* data) -{ - data->dims = (int3){-1, -1, -1}; - free(data->data); - data->data = NULL; - - return AC_SUCCESS; -} - AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle) { @@ -197,15 +104,6 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand acVertexBufferCompdomainSizeBytes(device_config))); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal))); -#if AC_MPI_ENABLED - // TODO - const int3 dims = (int3){NGHOST, NGHOST, NGHOST}; - for (int i = 0; i < NUM_CORNERS; ++i) { - device->corners_send[i] = acCreatePackedData(dims); - device->corners_recv[i] = acCreatePackedData(dims); - } -#endif - // Device constants acDeviceLoadMeshInfo(device, STREAM_DEFAULT, device_config); @@ -239,13 +137,6 @@ acDeviceDestroy(Device device) cudaFree(device->reduce_scratchpad); cudaFree(device->reduce_result); -#if AC_MPI_ENABLED - for (int i = 0; i < NUM_CORNERS; ++i) { - 
acDestroyPackedData(&device->corners_send[i]); - acDestroyPackedData(&device->corners_recv[i]); - } -#endif - - // Concurrency - for (int i = 0; i < NUM_STREAMS; ++i) { - cudaStreamDestroy(device->streams[i]); - } @@ -687,2167 +578,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType return AC_SUCCESS; } -//////////////////////////////////////////////////////////////////////////////////////////////////// -// MPI tests -//////////////////////////////////////////////////////////////////////////////////////////////////// -#if AC_MPI_ENABLED == 1 -/** - Running: mpirun -np <num_processes> -*/ -#include <mpi.h> - -#include <assert.h> - -static int -mod(const int a, const int b) -{ - const int r = a % b; - return r < 0 ? r + b : r; -} - -/* -static inline int -get_neighbor(const int3 offset) -{ - // The number of nodes is n^3 = m = num_processes - // Require that the problem size is always equivalent among processes (floor(cbrt(m))^3 == m) - // Require that mesh dimension is (n 2^w), where w is some integer - - int pid, num_processes; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &num_processes); - - const int n = (int)cbrt((float)num_processes); - ERRCHK_ALWAYS((int)ceil(cbrt((float)num_processes)) == n); - ERRCHK_ALWAYS(n * n * n == num_processes); - - return mod(pid + offset.x, n) + offset.y * n + offset.z * n * n; -} -*/ - -/* -#ifdef DECOMP_1D -// Working 1D decomp distribute/gather -static void -acDeviceDistributeMeshMPI(const AcMesh src, AcMesh* dst) -{ - MPI_Barrier(MPI_COMM_WORLD); - printf("Distributing mesh...\n"); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, num_processes; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &num_processes); - - const size_t count = acVertexBufferSize(dst->info); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - - if (pid == 0) { - // Communicate to self - assert(dst); - memcpy(&dst->vertex_buffer[i][0], // - &src.vertex_buffer[i][0], // - count * sizeof(src.vertex_buffer[i][0])); - - // Communicate to others - for (int j = 1; j < num_processes; ++j) { - const size_t src_idx = acVertexBufferIdx( - 0, 0, j * src.info.int_params[AC_nz] / num_processes, src.info); - - MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, j, 0, MPI_COMM_WORLD); - } - } - else { - assert(dst); - - // Recv - const size_t dst_idx = 0; - MPI_Status status; - MPI_Recv(&dst->vertex_buffer[i][dst_idx], count, datatype, 0, 0, MPI_COMM_WORLD, - &status); - } - } -} - -static void -acDeviceGatherMeshMPI(const AcMesh src, AcMesh* dst) -{ - MPI_Barrier(MPI_COMM_WORLD); - printf("Gathering mesh...\n"); - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, num_processes; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &num_processes); - - size_t count = acVertexBufferSize(src.info); - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - // Communicate to self - if (pid == 0) { - assert(dst); - memcpy(&dst->vertex_buffer[i][0], // - &src.vertex_buffer[i][0], // - count * sizeof(src.vertex_buffer[i][0])); - - for (int j = 1; j < num_processes; ++j) { - // Recv - const size_t dst_idx = acVertexBufferIdx( - 0, 0, j * dst->info.int_params[AC_nz] / num_processes, dst->info); - - assert(dst_idx + count <= acVertexBufferSize(dst->info)); - MPI_Status status; - MPI_Recv(&dst->vertex_buffer[i][dst_idx], count, datatype, j, 0, MPI_COMM_WORLD, - &status); - } - } - else { - // Send - const size_t src_idx = 0; - - assert(src_idx + count
<= acVertexBufferSize(src.info)); - MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, 0, 0, MPI_COMM_WORLD); - } - } -} -#endif -*/ - -#if 0 // Not using 1D decomp -// 1D decomp -static AcResult -acDeviceBoundStepMPI(const Device device) -{ - const int mx = device->local_config.int_params[AC_mx]; - const int my = device->local_config.int_params[AC_my]; - const int mz = device->local_config.int_params[AC_mz]; - const size_t count = mx * my * NGHOST; - - acDeviceSynchronizeStream(device, STREAM_ALL); - // Local boundconds - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - // Front plate local - { - const int3 start = (int3){0, 0, NGHOST}; - const int3 end = (int3){mx, my, 2 * NGHOST}; - acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end); - } - // Back plate local - { - const int3 start = (int3){0, 0, mz - 2 * NGHOST}; - const int3 end = (int3){mx, my, mz - NGHOST}; - acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end); - } - } -#define INNER_BOUNDCOND_STREAM ((Stream)(NUM_STREAMS - 1)) - // Inner boundconds (while waiting) - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - - const int3 start = (int3){0, 0, 2 * NGHOST}; - const int3 end = (int3){mx, my, mz - 2 * NGHOST}; - acDevicePeriodicBoundcondStep(device, INNER_BOUNDCOND_STREAM, (VertexBufferHandle)i, start, - end); - } - - // MPI - MPI_Request send_requests[2 * NUM_VTXBUF_HANDLES]; - MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES]; - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, num_processes; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &num_processes); - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - { // Recv neighbor's front - // ...|ooooxxx|... -> xxx|ooooooo|... - const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config); - const int recv_pid = (pid + num_processes - 1) % num_processes; - - MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD, - &recv_requests[i]); - } - { // Recv neighbor's back - // ...|ooooooo|xxx <- ...|xxxoooo|... - const size_t dst_idx = acVertexBufferIdx(0, 0, mz - NGHOST, device->local_config); - const int recv_pid = (pid + 1) % num_processes; - - MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, - NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, - &recv_requests[i + NUM_VTXBUF_HANDLES]); - } - } - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - acDeviceSynchronizeStream(device, (Stream)i); - { - // Send front - // ...|ooooxxx|... -> xxx|ooooooo|... - const size_t src_idx = acVertexBufferIdx(0, 0, mz - 2 * NGHOST, device->local_config); - const int send_pid = (pid + 1) % num_processes; - - MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD, - &send_requests[i]); - } - { // Send back - // ...|ooooooo|xxx <- ...|xxxoooo|... 
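/* Annotation (added, not part of the original file): the tag scheme in this
   function pairs each Isend with exactly one Irecv across the periodic ring
   in z. For vertex buffer i, tag i always carries the plate travelling
   towards +z and tag NUM_VTXBUF_HANDLES + i the plate travelling towards -z;
   e.g. with 4 ranks, rank 2's Isend(tag = i) to rank 3 can only match rank
   3's Irecv(tag = i) posted for rank 2, so the two directions cannot be
   confused even though both use the same communicator. */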
- const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config); - const int send_pid = (pid + num_processes - 1) % num_processes; - - MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, - i + NUM_VTXBUF_HANDLES, MPI_COMM_WORLD, - &send_requests[i + NUM_VTXBUF_HANDLES]); - } - } - MPI_Waitall(2 * NUM_VTXBUF_HANDLES, recv_requests, MPI_STATUSES_IGNORE); - MPI_Waitall(2 * NUM_VTXBUF_HANDLES, send_requests, MPI_STATUSES_IGNORE); - return AC_SUCCESS; -} - -// 1D decomp -static AcResult -acDeviceIntegrateStepMPI(const Device device, const AcReal dt) -{ - const int mx = device->local_config.int_params[AC_mx]; - const int my = device->local_config.int_params[AC_my]; - const int mz = device->local_config.int_params[AC_mz]; - const int nx = device->local_config.int_params[AC_nx]; - const int ny = device->local_config.int_params[AC_ny]; - const int nz = device->local_config.int_params[AC_nz]; - const size_t count = mx * my * NGHOST; - - for (int isubstep = 0; isubstep < 3; ++isubstep) { - acDeviceSynchronizeStream(device, STREAM_ALL); - // Local boundconds - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - // Front plate local - { - const int3 start = (int3){0, 0, NGHOST}; - const int3 end = (int3){mx, my, 2 * NGHOST}; - acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end); - } - // Back plate local - { - const int3 start = (int3){0, 0, mz - 2 * NGHOST}; - const int3 end = (int3){mx, my, mz - NGHOST}; - acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end); - } - } -#define INNER_BOUNDCOND_STREAM ((Stream)(NUM_STREAMS - 1)) - // Inner boundconds (while waiting) - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - - const int3 start = (int3){0, 0, 2 * NGHOST}; - const int3 end = (int3){mx, my, mz - 2 * NGHOST}; - acDevicePeriodicBoundcondStep(device, INNER_BOUNDCOND_STREAM, (VertexBufferHandle)i, - start, end); - } - - // MPI - MPI_Request send_requests[2 * NUM_VTXBUF_HANDLES]; - MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES]; - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, num_processes; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &num_processes); - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - { // Recv neighbor's front - // ...|ooooxxx|... -> xxx|ooooooo|... - const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config); - const int recv_pid = (pid + num_processes - 1) % num_processes; - - MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD, - &recv_requests[i]); - } - { // Recv neighbor's back - // ...|ooooooo|xxx <- ...|xxxoooo|... - const size_t dst_idx = acVertexBufferIdx(0, 0, mz - NGHOST, device->local_config); - const int recv_pid = (pid + 1) % num_processes; - - MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, - NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, - &recv_requests[i + NUM_VTXBUF_HANDLES]); - } - } - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - acDeviceSynchronizeStream(device, (Stream)i); - { - // Send front - // ...|ooooxxx|... -> xxx|ooooooo|... - const size_t src_idx = acVertexBufferIdx(0, 0, mz - 2 * NGHOST, - device->local_config); - const int send_pid = (pid + 1) % num_processes; - - MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD, - &send_requests[i]); - } - { // Send back - // ...|ooooooo|xxx <- ...|xxxoooo|... 
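/* Annotation (added, not part of the original file): the per-buffer
   acDeviceSynchronizeStream(device, (Stream)i) at the top of this send loop
   is what makes the Isend safe: it blocks the host until the local
   boundary-condition kernels queued on stream i have finished writing
   vba.in[i], so MPI never reads a plate the GPU is still filling. */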
- const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config); - const int send_pid = (pid + num_processes - 1) % num_processes; - - MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, - i + NUM_VTXBUF_HANDLES, MPI_COMM_WORLD, - &send_requests[i + NUM_VTXBUF_HANDLES]); - } - } - // Inner integration - { - ERRCHK(NUM_STREAMS - 2 >= 0); - const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; - const int3 m2 = (int3){mx, my, mz} - m1; - acDeviceIntegrateSubstep(device, (Stream)(NUM_STREAMS - 2), isubstep, m1, m2, dt); - } - - MPI_Waitall(2 * NUM_VTXBUF_HANDLES, recv_requests, MPI_STATUSES_IGNORE); - MPI_Waitall(2 * NUM_VTXBUF_HANDLES, send_requests, MPI_STATUSES_IGNORE); - - acDeviceSynchronizeStream(device, INNER_BOUNDCOND_STREAM); - { // Front - const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; - const int3 m2 = m1 + (int3){nx, ny, NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt); - } - { // Back - const int3 m1 = (int3){NGHOST, NGHOST, nz}; - const int3 m2 = m1 + (int3){nx, ny, NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt); - } - { // Bottom - const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; - const int3 m2 = m1 + (int3){nx, NGHOST, nz - 2 * NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt); - } - { // Top - const int3 m1 = (int3){NGHOST, ny, 2 * NGHOST}; - const int3 m2 = m1 + (int3){nx, NGHOST, nz - 2 * NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt); - } - { // Left - const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; - const int3 m2 = m1 + (int3){NGHOST, ny - 2 * NGHOST, nz - 2 * NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt); - } - { // Right - const int3 m1 = (int3){nx, 2 * NGHOST, 2 * NGHOST}; - const int3 m2 = m1 + (int3){NGHOST, ny - 2 * NGHOST, nz - 2 * NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt); - } - MPI_Barrier(MPI_COMM_WORLD); - acDeviceSwapBuffers(device); - MPI_Barrier(MPI_COMM_WORLD); - } - - return AC_SUCCESS; -} -#endif - -static int3 -decompose(const int target) -{ - int decomposition[] = {1, 1, 1}; - - int axis = 0; - while (decomposition[0] * decomposition[1] * decomposition[2] < target) { - ++decomposition[axis]; - axis = (axis + 1) % 3; - } - - const int found = decomposition[0] * decomposition[1] * decomposition[2]; - if (found != target) { - fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n"); - fprintf(stderr, "Target nprocs: %d. 
Next allowed: %d\n", target, found); - ERROR("Invalid nprocs"); - return (int3){-1, -1, -1}; - } - else { - return (int3){decomposition[0], decomposition[1], decomposition[2]}; - } -} - -static int3 -getPid3D(const int pid, const int3 decomposition) -{ - const int3 pid3d = (int3){ - mod(pid, decomposition.x), - mod(pid / decomposition.x, decomposition.y), - (pid / (decomposition.x * decomposition.y)), - }; - return pid3d; -} - -static int -getPid(const int3 pid, const int3 decomposition) -{ - return mod(pid.x, decomposition.x) + // - mod(pid.y, decomposition.y) * decomposition.x + // - mod(pid.z, decomposition.z) * decomposition.x * decomposition.y; -} - -static AcResult -acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst) -{ - MPI_Barrier(MPI_COMM_WORLD); - printf("Distributing mesh...\n"); - fflush(stdout); - assert(dst); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, nprocs; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - // Submesh nn - const int3 nn = (int3){ - dst->info.int_params[AC_nx], - dst->info.int_params[AC_ny], - dst->info.int_params[AC_nz], - }; - - // Submesh mm - const int3 mm = (int3){ - dst->info.int_params[AC_mx], - dst->info.int_params[AC_my], - dst->info.int_params[AC_mz], - }; - - // Send to self - if (pid == 0) { - for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { - // For pencils - for (int k = NGHOST; k < NGHOST + nn.z; ++k) { - for (int j = NGHOST; j < NGHOST + nn.y; ++j) { - const int i = NGHOST; - const int count = nn.x; - const int src_idx = acVertexBufferIdx(i, j, k, src.info); - const int dst_idx = acVertexBufferIdx(i, j, k, dst->info); - memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], // - &src.vertex_buffer[vtxbuf][src_idx], // - count * sizeof(src.vertex_buffer[i][0])); - } - } - } - } - - for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { - // For pencils - for (int k = NGHOST; k < NGHOST + nn.z; ++k) { - for (int j = NGHOST; j < NGHOST + nn.y; ++j) { - const int i = NGHOST; - const int count = nn.x; - - if (pid != 0) { - const int dst_idx = acVertexBufferIdx(i, j, k, dst->info); - // Recv - MPI_Status status; - MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0, - MPI_COMM_WORLD, &status); - } - else { - for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) { - const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition); - const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, // - j + tgt_pid3d.y * nn.y, // - k + tgt_pid3d.z * nn.z, // - src.info); - - // Send - MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0, - MPI_COMM_WORLD); - } - } - } - } - } - return AC_SUCCESS; -} - -static AcResult -acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst) -{ - MPI_Barrier(MPI_COMM_WORLD); - printf("Gathering mesh...\n"); - fflush(stdout); - assert(dst); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, nprocs; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - // Submesh nn - const int3 nn = (int3){ - src.info.int_params[AC_nx], - src.info.int_params[AC_ny], - src.info.int_params[AC_nz], - }; - - // Submesh mm - const int3 mm = (int3){ - src.info.int_params[AC_mx], - src.info.int_params[AC_my], - src.info.int_params[AC_mz], - }; - - // Send to self - if (pid == 0) { - for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { - // For pencils - for (int k = 0; k < 
mm.z; ++k) { - for (int j = 0; j < mm.y; ++j) { - const int i = 0; - const int count = mm.x; - const int src_idx = acVertexBufferIdx(i, j, k, src.info); - const int dst_idx = acVertexBufferIdx(i, j, k, dst->info); - memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], // - &src.vertex_buffer[vtxbuf][src_idx], // - count * sizeof(src.vertex_buffer[i][0])); - } - } - } - } - - for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { - // For pencils - for (int k = 0; k < mm.z; ++k) { - for (int j = 0; j < mm.y; ++j) { - const int i = 0; - const int count = mm.x; - - if (pid != 0) { - // Send - const int src_idx = acVertexBufferIdx(i, j, k, src.info); - MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0, - MPI_COMM_WORLD); - } - else { - for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) { - const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition); - const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, // - j + tgt_pid3d.y * nn.y, // - k + tgt_pid3d.z * nn.z, // - dst->info); - - // Recv - MPI_Status status; - MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0, - MPI_COMM_WORLD, &status); - } - } - } - } - } - return AC_SUCCESS; -} - -/* -// Close to correct 3D distribute/gather but deadlocks -static AcResult -acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst) -{ - MPI_Barrier(MPI_COMM_WORLD); - printf("Distributing mesh...\n"); - assert(dst); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, nprocs; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - // Submesh nn - const int3 nn = (int3){ - dst->info.int_params[AC_nx], - dst->info.int_params[AC_ny], - dst->info.int_params[AC_nz], - }; - - // Submesh mm - const int3 mm = (int3){ - dst->info.int_params[AC_mx], - dst->info.int_params[AC_my], - dst->info.int_params[AC_mz], - }; - - for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) { - const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition); - for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { - - // For pencils - for (int k = NGHOST; k < NGHOST + nn.z; ++k) { - for (int j = NGHOST; j < NGHOST + nn.y; ++j) { - const int i = NGHOST; - const int count = nn.x; - const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, // - j + tgt_pid3d.y * nn.y, // - k + tgt_pid3d.z * nn.z, // - src.info); - const int dst_idx = acVertexBufferIdx(i, j, k, dst->info); - - if (pid == 0) { - // Send - if (pid == tgt_pid) { - // Send to self - memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], // - &src.vertex_buffer[vtxbuf][src_idx], // - count * sizeof(src.vertex_buffer[i][0])); - } - else { - // Send to others - MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, - vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES, - MPI_COMM_WORLD); - } - } - else { - // Recv - MPI_Status status; - MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, - vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES, - MPI_COMM_WORLD, &status); - } - } - } - } - } - return AC_SUCCESS; -} - -static AcResult -acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst) -{ - MPI_Barrier(MPI_COMM_WORLD); - printf("Distributing mesh...\n"); - assert(dst); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, nprocs; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - // Submesh nn - const int3 nn = (int3){ - 
dst->info.int_params[AC_nx], - dst->info.int_params[AC_ny], - dst->info.int_params[AC_nz], - }; - - // Submesh mm - const int3 mm = (int3){ - dst->info.int_params[AC_mx], - dst->info.int_params[AC_my], - dst->info.int_params[AC_mz], - }; - - for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) { - const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition); - for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { - - // For pencils - for (int k = NGHOST; k < NGHOST + nn.z; ++k) { - for (int j = NGHOST; j < NGHOST + nn.y; ++j) { - const int i = NGHOST; - const int count = nn.x; - const int src_idx = acVertexBufferIdx(i, j, k, src.info); - const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, // - j + tgt_pid3d.y * nn.y, // - k + tgt_pid3d.z * nn.z, // - dst->info); - - if (pid == 0) { - // Send - if (pid == tgt_pid) { - // Send to self - memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], // - &src.vertex_buffer[vtxbuf][src_idx], // - count * sizeof(src.vertex_buffer[i][0])); - } - else { - // Recv from others - MPI_Status status; - MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, - vtxbuf, MPI_COMM_WORLD, &status); - } - } - else { - // Send to 0 - MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, vtxbuf, - MPI_COMM_WORLD); - } - } - } - } - } - return AC_SUCCESS; -} -*/ - -/* -static void -acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst) -{ - MPI_Barrier(MPI_COMM_WORLD); - printf("Distributing mesh...\n"); - assert(dst); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, num_processes; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &num_processes); - - // const size_t count = acVertexBufferSize(dst->info); - const int3 offset = (int3){ - src.info.int_params[AC_nx] / decomposition.x, - src.info.int_params[AC_ny] / decomposition.y, - src.info.int_params[AC_nz] / decomposition.z, - }; - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - - if (pid == 0) { - // Communicate to self - memcpy(&dst->vertex_buffer[i][0], // - &src.vertex_buffer[i][0], // - count * sizeof(src.vertex_buffer[i][0])); - - // Communicate to others - for (int tgt = 1; tgt < num_processes; ++tgt) { - const int3 target_pid = getPid3D(tgt, decomposition); - const size_t src_idx = acVertexBufferIdx(target_pid.x * offset.x, - target_pid.y * offset.y, - target_pid.z * offset.z, src.info); - - MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, tgt, 0, MPI_COMM_WORLD); - } - } - else { - - // Recv - const size_t dst_idx = 0; - MPI_Status status; - MPI_Recv(&dst->vertex_buffer[i][dst_idx], count, datatype, 0, 0, MPI_COMM_WORLD, - &status); - } - } -} - - -static void -acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst) -{ - MPI_Barrier(MPI_COMM_WORLD); - printf("Gathering mesh...\n"); - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int pid, num_processes; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &num_processes); - - size_t count = acVertexBufferSize(src.info); - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - // Communicate to self - if (pid == 0) { - assert(dst); - memcpy(&dst->vertex_buffer[i][0], // - &src.vertex_buffer[i][0], // - count * sizeof(src.vertex_buffer[i][0])); - - for (int j = 1; j < num_processes; ++j) { - // Recv - // NOTE! Gathers halos also! Must be up to date! - // Not well-defined which GPU it gets the boundary from! 
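// Annotation (added, not part of the original file): getPid3D and getPid
// form a row-major bijection between ranks and decomposition coordinates.
// E.g. with decomposition (2, 2, 2), pid 5 -> (mod(5, 2), mod(5 / 2, 2), 5 / 4)
// = (1, 0, 1), and getPid((1, 0, 1), (2, 2, 2)) = 1 + 0 * 2 + 1 * 2 * 2 = 5
// recovers the rank.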
- const int3 target_pid = getPid3D(j, decomposition); - const int3 offset = (int3){ - dst->info.int_params[AC_nx] / decomposition.x, - dst->info.int_params[AC_ny] / decomposition.y, - dst->info.int_params[AC_nz] / decomposition.z, - }; - const size_t dst_idx = acVertexBufferIdx(target_pid.x * offset.x, - target_pid.y * offset.y, - target_pid.z * offset.z, dst->info); - - assert(dst_idx + count <= acVertexBufferSize(dst->info)); - MPI_Status status; - MPI_Recv(&dst->vertex_buffer[i][dst_idx], count, datatype, j, 0, MPI_COMM_WORLD, - &status); - } - } - else { - // Send - const size_t src_idx = 0; - - assert(src_idx + count <= acVertexBufferSize(src.info)); - MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, 0, 0, MPI_COMM_WORLD); - } - } -} -*/ - -/* -static AcResult -acDeviceCommunicateHalosMPI(const Device device) -{ - acDeviceSynchronizeStream(device, STREAM_ALL); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const int3 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - (int3){nn.x, nn.y, nn.z}, - }; - const int3 b0s[] = { - (int3){0, 0, 0}, - (int3){NGHOST + nn.x, 0, 0}, - (int3){0, NGHOST + nn.y, 0}, - (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, - - (int3){0, 0, NGHOST + nn.z}, - (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, - (int3){0, NGHOST + nn.y, NGHOST + nn.z}, - (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, - }; - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - for (int i = 0; i < ARRAY_SIZE(a0s); ++i) - acKernelPackData(stream, device->vba, a0s[i], device->corners_send[i]); - - MPI_Barrier(MPI_COMM_WORLD); - acDeviceSynchronizeStream(device, STREAM_ALL); // TODO debug remove - MPI_Barrier(MPI_COMM_WORLD); - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - const int3 neighbor = (int3){i, j, k}; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - const int3 a0 = a0s[a_idx]; - const int3 a1 = a0 + device->corners_send[a_idx].dims; - - const int3 nmin = (int3){NGHOST, NGHOST, NGHOST}; - const int3 nmax = nmin + nn + (int3){1, 1, 1}; - ERRCHK_ALWAYS(isWithin(a1, nmin, nmax)); - - const int3 b0 = a0 - neighbor * nn; - const int3 b1 = a1 - neighbor * nn; - - const int3 mmin = (int3){0, 0, 0}; - const int3 mmax = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST} + nn; - if (isWithin(b0, mmin, mmax) && isWithin(b1, mmin, mmax + (int3){1, 1, 1})) { - printf("neighbor "); - print_int3(neighbor); - printf("\n"); - printf("\tb0: "); - print_int3(b0); - printf("\n"); - - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && - b0s[b_idx].z == b0.z) { - - ERRCHK_ALWAYS(a_idx < NUM_CORNERS); - ERRCHK_ALWAYS(b_idx < NUM_CORNERS); - - const int3 pid3d = getPid3D(pid, decomp); - const size_t count = device->corners_send[a_idx].dims.x * - device->corners_send[a_idx].dims.y * - device->corners_send[a_idx].dims.z * - NUM_VTXBUF_HANDLES; - - MPI_Request 
send_req, recv_req; - MPI_Isend((device->corners_send[a_idx]).data, count, datatype, - getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD, - &send_req); - - MPI_Irecv((device->corners_recv[b_idx]).data, count, datatype, - getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD, - &recv_req); - - MPI_Status status; - MPI_Wait(&recv_req, &status); - - break; - } - } - } - } - } - } - } - printf("------------------\n"); - - MPI_Barrier(MPI_COMM_WORLD); - acDeviceSynchronizeStream(device, STREAM_ALL); - // Unpack data - for (int i = 0; i < ARRAY_SIZE(b0s); ++i) { - acKernelUnpackData(stream, device->corners_recv[i], b0s[i], device->vba); - } - - return AC_SUCCESS; -}*/ - -static AcResult -acDeviceCommunicateCornersMPI(const Device device) -{ - cudaSetDevice(device->id); - acDeviceSynchronizeStream(device, STREAM_ALL); - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const int3 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - (int3){nn.x, nn.y, nn.z}, - }; - const int3 b0s[] = { - (int3){0, 0, 0}, - (int3){NGHOST + nn.x, 0, 0}, - (int3){0, NGHOST + nn.y, 0}, - (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, - - (int3){0, 0, NGHOST + nn.z}, - (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, - (int3){0, NGHOST + nn.y, NGHOST + nn.z}, - (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, - }; - - const int3 dims = (int3){NGHOST, NGHOST, NGHOST}; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData src = acCreatePackedData(dims); - PackedData dst = acCreatePackedData(dims); - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - acKernelPackData(stream, device->vba, a0, src); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - // Host //////////////////////////////////////////////// - PackedData src_host = acCreatePackedDataHost(dims); - PackedData dst_host = acCreatePackedDataHost(dims); - acTransferPackedDataToHost(src, &src_host); - acDeviceSynchronizeStream(device, STREAM_ALL); - MPI_Barrier(MPI_COMM_WORLD); - //////////////////////////////////////////////////////// - - const int3 pid3d = getPid3D(pid, decomp); - MPI_Request send_req, recv_req; - MPI_Isend(src_host.data, count, datatype, - getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD, - &send_req); - MPI_Irecv(dst_host.data, count, datatype, - getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD, - &recv_req); - - MPI_Wait(&send_req, MPI_STATUS_IGNORE); - 
MPI_Wait(&recv_req, MPI_STATUS_IGNORE); - - // Host //////////////////////////////////////////////// - acTransferPackedDataToDevice(dst_host, &dst); - acDeviceSynchronizeStream(device, STREAM_ALL); - acDestroyPackedDataHost(&src_host); - acDestroyPackedDataHost(&dst_host); - //////////////////////////////////////////////////////// - - acKernelUnpackData(stream, dst, b0, device->vba); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - acDestroyPackedData(&src); - acDestroyPackedData(&dst); - } - } - } - } - } - } - - return AC_SUCCESS; -} - -static AcResult -acDeviceCommunicateEdgesMPI(const Device device) -{ - cudaSetDevice(device->id); - acDeviceSynchronizeStream(device, STREAM_ALL); - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const int3 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - { - // X-axis - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - }; - const int3 b0s[] = { - (int3){NGHOST, 0, 0}, - (int3){NGHOST, NGHOST + nn.y, 0}, - - (int3){NGHOST, 0, NGHOST + nn.z}, - (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, - }; - - const int3 dims = (int3){nn.x, NGHOST, NGHOST}; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && - b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData src = acCreatePackedData(dims); - PackedData dst = acCreatePackedData(dims); - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - acKernelPackData(stream, device->vba, a0, src); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - // Host //////////////////////////////////////////////// - PackedData src_host = acCreatePackedDataHost(dims); - PackedData dst_host = acCreatePackedDataHost(dims); - acTransferPackedDataToHost(src, &src_host); - acDeviceSynchronizeStream(device, STREAM_ALL); - //////////////////////////////////////////////////////// - - const int3 pid3d = getPid3D(pid, decomp); - MPI_Request send_req, recv_req; - MPI_Isend(src_host.data, count, datatype, - getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD, - &send_req); - MPI_Irecv(dst_host.data, count, datatype, - getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD, - &recv_req); - - MPI_Wait(&send_req, MPI_STATUS_IGNORE); - MPI_Wait(&recv_req, MPI_STATUS_IGNORE); - - // Host //////////////////////////////////////////////// - acTransferPackedDataToDevice(dst_host, &dst); - acDeviceSynchronizeStream(device, STREAM_ALL); - acDestroyPackedDataHost(&src_host); - acDestroyPackedDataHost(&dst_host); - //////////////////////////////////////////////////////// - - acKernelUnpackData(stream, dst, b0, device->vba); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - 
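/* Annotation (added, not part of the original file): each matched segment
   runs a five-stage pipeline: pack on the device, stage the packed block to
   the host, exchange it with MPI_Isend/MPI_Irecv + MPI_Wait, stage the
   received block back to the device, and unpack into vba. Staging through
   the host presumably avoids relying on CUDA-aware MPI, at the cost of two
   extra host-device copies per segment. */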
acDestroyPackedData(&src); - acDestroyPackedData(&dst); - } - } - } - } - } - } - } - - { - // Y-axis - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - }; - const int3 b0s[] = { - (int3){0, NGHOST, 0}, - (int3){NGHOST + nn.x, NGHOST, 0}, - - (int3){0, NGHOST, NGHOST + nn.z}, - (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, - }; - - const int3 dims = (int3){NGHOST, nn.y, NGHOST}; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && - b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData src = acCreatePackedData(dims); - PackedData dst = acCreatePackedData(dims); - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - acKernelPackData(stream, device->vba, a0, src); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - // Host //////////////////////////////////////////////// - PackedData src_host = acCreatePackedDataHost(dims); - PackedData dst_host = acCreatePackedDataHost(dims); - acTransferPackedDataToHost(src, &src_host); - acDeviceSynchronizeStream(device, STREAM_ALL); - //////////////////////////////////////////////////////// - - const int3 pid3d = getPid3D(pid, decomp); - MPI_Request send_req, recv_req; - MPI_Isend(src_host.data, count, datatype, - getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD, - &send_req); - MPI_Irecv(dst_host.data, count, datatype, - getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD, - &recv_req); - - MPI_Wait(&send_req, MPI_STATUS_IGNORE); - MPI_Wait(&recv_req, MPI_STATUS_IGNORE); - - // Host //////////////////////////////////////////////// - acTransferPackedDataToDevice(dst_host, &dst); - acDeviceSynchronizeStream(device, STREAM_ALL); - acDestroyPackedDataHost(&src_host); - acDestroyPackedDataHost(&dst_host); - //////////////////////////////////////////////////////// - - acKernelUnpackData(stream, dst, b0, device->vba); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - acDestroyPackedData(&src); - acDestroyPackedData(&dst); - } - } - } - } - } - } - } - - { - // Z-axis - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - }; - const int3 b0s[] = { - (int3){0, 0, NGHOST}, - (int3){NGHOST + nn.x, 0, NGHOST}, - - (int3){0, NGHOST + nn.y, NGHOST}, - (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, - }; - - const int3 dims = (int3){NGHOST, NGHOST, nn.z}; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && - 
b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData src = acCreatePackedData(dims); - PackedData dst = acCreatePackedData(dims); - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - acKernelPackData(stream, device->vba, a0, src); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - // Host //////////////////////////////////////////////// - PackedData src_host = acCreatePackedDataHost(dims); - PackedData dst_host = acCreatePackedDataHost(dims); - acTransferPackedDataToHost(src, &src_host); - acDeviceSynchronizeStream(device, STREAM_ALL); - //////////////////////////////////////////////////////// - - const int3 pid3d = getPid3D(pid, decomp); - MPI_Request send_req, recv_req; - MPI_Isend(src_host.data, count, datatype, - getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD, - &send_req); - MPI_Irecv(dst_host.data, count, datatype, - getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD, - &recv_req); - - MPI_Wait(&send_req, MPI_STATUS_IGNORE); - MPI_Wait(&recv_req, MPI_STATUS_IGNORE); - - // Host //////////////////////////////////////////////// - acTransferPackedDataToDevice(dst_host, &dst); - acDeviceSynchronizeStream(device, STREAM_ALL); - acDestroyPackedDataHost(&src_host); - acDestroyPackedDataHost(&dst_host); - //////////////////////////////////////////////////////// - - acKernelUnpackData(stream, dst, b0, device->vba); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - acDestroyPackedData(&src); - acDestroyPackedData(&dst); - } - } - } - } - } - } - } - - return AC_SUCCESS; -} - -static AcResult -acDeviceCommunicateSidesMPI(const Device device) -{ - cudaSetDevice(device->id); - acDeviceSynchronizeStream(device, STREAM_ALL); - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const int3 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - { - // XY-axis - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, NGHOST, nn.z}, // - }; - const int3 b0s[] = { - (int3){NGHOST, NGHOST, 0}, // - (int3){NGHOST, NGHOST, NGHOST + nn.z}, // - }; - - const int3 dims = (int3){nn.x, nn.y, NGHOST}; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && - b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData src = acCreatePackedData(dims); - PackedData dst = acCreatePackedData(dims); - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - acKernelPackData(stream, device->vba, a0, src); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - // Host //////////////////////////////////////////////// - PackedData src_host = acCreatePackedDataHost(dims); - PackedData dst_host = acCreatePackedDataHost(dims); - acTransferPackedDataToHost(src, &src_host); - 
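/* Annotation (added, not part of the original file): the b0 = a0 - neighbor * nn
   match above decides which ghost segment a packed block lands in on the
   receiving side. E.g. for neighbor = (0, 0, 1), the top xy slab
   a0 = (NGHOST, NGHOST, nn.z) gives b0 = (NGHOST, NGHOST, 0): each rank
   receives from pid3d - neighbor, i.e. the rank below it in z, exactly the
   slab that fills its bottom ghost zone. */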
acDeviceSynchronizeStream(device, STREAM_ALL); - //////////////////////////////////////////////////////// - - const int3 pid3d = getPid3D(pid, decomp); - MPI_Request send_req, recv_req; - MPI_Isend(src_host.data, count, datatype, - getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD, - &send_req); - MPI_Irecv(dst_host.data, count, datatype, - getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD, - &recv_req); - - MPI_Wait(&send_req, MPI_STATUS_IGNORE); - MPI_Wait(&recv_req, MPI_STATUS_IGNORE); - - // Host //////////////////////////////////////////////// - acTransferPackedDataToDevice(dst_host, &dst); - acDeviceSynchronizeStream(device, STREAM_ALL); - acDestroyPackedDataHost(&src_host); - acDestroyPackedDataHost(&dst_host); - //////////////////////////////////////////////////////// - - acKernelUnpackData(stream, dst, b0, device->vba); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - acDestroyPackedData(&src); - acDestroyPackedData(&dst); - } - } - } - } - } - } - } - - { - // XZ-axis - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - }; - const int3 b0s[] = { - (int3){NGHOST, 0, NGHOST}, // - (int3){NGHOST, NGHOST + nn.y, NGHOST}, // - }; - const int3 dims = (int3){nn.x, NGHOST, nn.z}; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && - b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData src = acCreatePackedData(dims); - PackedData dst = acCreatePackedData(dims); - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - acKernelPackData(stream, device->vba, a0, src); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - // Host //////////////////////////////////////////////// - PackedData src_host = acCreatePackedDataHost(dims); - PackedData dst_host = acCreatePackedDataHost(dims); - acTransferPackedDataToHost(src, &src_host); - acDeviceSynchronizeStream(device, STREAM_ALL); - //////////////////////////////////////////////////////// - - const int3 pid3d = getPid3D(pid, decomp); - MPI_Request send_req, recv_req; - MPI_Isend(src_host.data, count, datatype, - getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD, - &send_req); - MPI_Irecv(dst_host.data, count, datatype, - getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD, - &recv_req); - - MPI_Wait(&send_req, MPI_STATUS_IGNORE); - MPI_Wait(&recv_req, MPI_STATUS_IGNORE); - - // Host //////////////////////////////////////////////// - acTransferPackedDataToDevice(dst_host, &dst); - acDeviceSynchronizeStream(device, STREAM_ALL); - acDestroyPackedDataHost(&src_host); - acDestroyPackedDataHost(&dst_host); - //////////////////////////////////////////////////////// - - acKernelUnpackData(stream, dst, b0, device->vba); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - acDestroyPackedData(&src); - acDestroyPackedData(&dst); - } - } - } - } - } - } - } - - { - // YZ-axis - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - }; - const int3 b0s[] = { - (int3){0, NGHOST, NGHOST}, // 
- (int3){NGHOST + nn.x, NGHOST, NGHOST}, // - }; - const int3 dims = (int3){NGHOST, nn.y, nn.z}; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && - b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData src = acCreatePackedData(dims); - PackedData dst = acCreatePackedData(dims); - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - acKernelPackData(stream, device->vba, a0, src); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - // Host //////////////////////////////////////////////// - PackedData src_host = acCreatePackedDataHost(dims); - PackedData dst_host = acCreatePackedDataHost(dims); - acTransferPackedDataToHost(src, &src_host); - acDeviceSynchronizeStream(device, STREAM_ALL); - //////////////////////////////////////////////////////// - - const int3 pid3d = getPid3D(pid, decomp); - MPI_Request send_req, recv_req; - MPI_Isend(src_host.data, count, datatype, - getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD, - &send_req); - MPI_Irecv(dst_host.data, count, datatype, - getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD, - &recv_req); - - MPI_Wait(&send_req, MPI_STATUS_IGNORE); - MPI_Wait(&recv_req, MPI_STATUS_IGNORE); - - // Host //////////////////////////////////////////////// - acTransferPackedDataToDevice(dst_host, &dst); - acDeviceSynchronizeStream(device, STREAM_ALL); - acDestroyPackedDataHost(&src_host); - acDestroyPackedDataHost(&dst_host); - //////////////////////////////////////////////////////// - - acKernelUnpackData(stream, dst, b0, device->vba); - acDeviceSynchronizeStream(device, STREAM_DEFAULT); - - acDestroyPackedData(&src); - acDestroyPackedData(&dst); - } - } - } - } - } - } - } - - return AC_SUCCESS; -} - -static AcResult -acDeviceCommunicateHalosMPI(const Device device) -{ - acDeviceCommunicateCornersMPI(device); - acDeviceCommunicateEdgesMPI(device); - acDeviceCommunicateSidesMPI(device); - return AC_SUCCESS; -} -/* -static AcResult -acDeviceCommunicateHalosMPI(const Device device) -{ - acDeviceSynchronizeStream(device, STREAM_ALL); - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const int3 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - // Pack data - const int3 a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - (int3){nn.x, nn.y, nn.z}, - }; - const int3 b0s[] = { - (int3){0, 0, 0}, - (int3){NGHOST + nn.x, 0, 0}, - (int3){0, NGHOST + nn.y, 0}, - (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, - - (int3){0, 0, NGHOST + nn.z}, - (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, - (int3){0, NGHOST + nn.y, NGHOST + nn.z}, - 
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, - }; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - const int3 neighbor = (int3){i, j, k}; - - for (size_t a_idx = 0; a_idx < ARRAY_SIZE(a0s); ++a_idx) { - const int3 a0 = a0s[a_idx]; - const int3 a1 = a0 + device->corners_send[a_idx].dims; - - const int3 nmin = (int3){NGHOST, NGHOST, NGHOST}; - const int3 nmax = nmin + nn + (int3){1, 1, 1}; - ERRCHK_ALWAYS(isWithin(a1, nmin, nmax)); - - const int3 b0 = a0 - neighbor * nn; - const int3 b1 = a1 - neighbor * nn; - - const int3 mmin = (int3){0, 0, 0}; - const int3 mmax = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST} + nn; - if (isWithin(b0, mmin, mmax) && isWithin(b1, mmin, mmax + (int3){1, 1, 1})) { - printf("neighbor "); - print_int3(neighbor); - printf("\n"); - printf("\tb0: "); - print_int3(b0); - printf("\n"); - - for (size_t b_idx = 0; b_idx < ARRAY_SIZE(b0s); ++b_idx) { - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && - b0s[b_idx].z == b0.z) { - - ERRCHK_ALWAYS(a_idx < NUM_CORNERS); - ERRCHK_ALWAYS(b_idx < NUM_CORNERS); - const int3 pid3d = getPid3D(pid, decomp); - - const int3 dims = device->corners_send[a_idx].dims; - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - const cudaStream_t stream = device->streams[STREAM_DEFAULT]; - - AcReal *src, *dst; - ERRCHK_CUDA_ALWAYS( - cudaMalloc((void**)&src, count * sizeof(src[0]))); - ERRCHK_CUDA_ALWAYS( - cudaMalloc((void**)&dst, count * sizeof(dst[0]))); - - PackedData srcdata = {dims, src}; - acKernelPackData(stream, device->vba, a0, srcdata); - acDeviceSynchronizeStream(device, STREAM_ALL); - - MPI_Request send_req, recv_req; - MPI_Isend(src, count, datatype, getPid(pid3d + neighbor, decomp), - b_idx, MPI_COMM_WORLD, &send_req); - MPI_Irecv(dst, count, datatype, getPid(pid3d - neighbor, decomp), - b_idx, MPI_COMM_WORLD, &recv_req); - - MPI_Status status; - MPI_Wait(&recv_req, &status); - - PackedData dstdata = {dims, dst}; - acKernelUnpackData(stream, dstdata, b0, device->vba); - acDeviceSynchronizeStream(device, STREAM_ALL); - cudaFree(src); - cudaFree(dst); - - printf("Sent!\n"); - break; - } - } - } - } - } - } - } - printf("------------------\n"); - return AC_SUCCESS; -}*/ - -// From Astaroth Utils -#include "src/utils/config_loader.h" -#include "src/utils/memory.h" -#include "src/utils/modelsolver.h" -#include "src/utils/timer_hires.h" -#include "src/utils/verification.h" - -#include <algorithm> -#include <vector> -// --smpiargs="-gpu" - -// 3D decomposition -AcResult -acDeviceRunMPITest(void) -{ - // If 1, runs the strong scaling tests and verification. - // Verification is disabled when benchmarking weak scaling because the - // whole grid is too large to fit into memory. -#define BENCH_STRONG_SCALING (1) - - MPI_Init(NULL, NULL); - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - - char processor_name[MPI_MAX_PROCESSOR_NAME]; - int name_len; - MPI_Get_processor_name(processor_name, &name_len); - printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs); - - // Create model and candidate meshes - AcMeshInfo info; - acLoadConfig(AC_DEFAULT_CONFIG, &info); - - // Large mesh dim - const int nn = 64; - info.int_params[AC_nx] = info.int_params[AC_ny] = nn; - info.int_params[AC_nz] = BENCH_STRONG_SCALING ?
nn : nn * nprocs; - info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx]; - info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy]; - info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz]; - info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound]; - acUpdateConfig(&info); - acPrintMeshInfo(info); - -#if BENCH_STRONG_SCALING - AcMesh model, candidate; - - // Master CPU - if (pid == 0) { - acMeshCreate(info, &model); - acMeshCreate(info, &candidate); - - acMeshRandomize(&model); - acMeshRandomize(&candidate); - } -#endif - - /// DECOMPOSITION - AcMeshInfo submesh_info = info; - const int3 decomposition = decompose(nprocs); - const int3 pid3d = getPid3D(pid, decomposition); - - printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z); - printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z); - ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0); - ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0); - ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0); - - submesh_info.int_params[AC_nx] = info.int_params[AC_nx] / decomposition.x; - submesh_info.int_params[AC_ny] = info.int_params[AC_ny] / decomposition.y; - submesh_info.int_params[AC_nz] = info.int_params[AC_nz] / decomposition.z; - submesh_info.int3_params[AC_global_grid_n] = (int3){ - info.int_params[AC_nx], - info.int_params[AC_ny], - info.int_params[AC_nz], - }; - submesh_info.int3_params[AC_multigpu_offset] = (int3){-1, -1, -1}; // TODO - WARNING("AC_multigpu_offset not yet implemented"); - acUpdateConfig(&submesh_info); - ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_inv_dsx])); - ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_cs2_sound])); - // - - AcMesh submesh; - acMeshCreate(submesh_info, &submesh); - acMeshRandomize(&submesh); - -#if BENCH_STRONG_SCALING - MPI_Barrier(MPI_COMM_WORLD); - acDeviceDistributeMeshMPI(model, decomposition, &submesh); - MPI_Barrier(MPI_COMM_WORLD); -#endif - - // Init the GPU - int devices_per_node = -1; - cudaGetDeviceCount(&devices_per_node); - - Device device; - acDeviceCreate(pid % devices_per_node, submesh_info, &device); - acDeviceLoadMesh(device, STREAM_DEFAULT, submesh); - - MPI_Barrier(MPI_COMM_WORLD); - // Attempt to enable peer access with all neighbors in the node - for (int i = 0; i < devices_per_node; ++i) { - cudaSetDevice(device->id); - // WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(i, 0)); // TODO RE-ENABLE - } - /* - // Attempt to enable peer access to the most expensive neighbors (sides) - const int left = getPid(pid3d + (int3){-1, 0, 0}, decomposition); - const int right = getPid(pid3d + (int3){1, 0, 0}, decomposition); - const int up = getPid(pid3d + (int3){0, 1, 0}, decomposition); - const int down = getPid(pid3d + (int3){0, -1, 0}, decomposition); - const int front = getPid(pid3d + (int3){0, 0, 1}, decomposition); - const int back = getPid(pid3d + (int3){0, 0, -1}, decomposition); - - cudaSetDevice(device->id); - WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(left, 0)); - WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(right, 0)); - WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(up, 0)); - WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(down, 0)); - WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); - WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); - */ - -// Verification start /////////////////////////////////////////////////////////////////////// -#if BENCH_STRONG_SCALING - { - // const float 
-        // const float dt = FLT_EPSILON; // TODO
-        // acDeviceIntegrateStepMPI(device, dt); // TODO
-        // acDeviceBoundStepMPI(device); TODO
-        acDeviceSynchronizeStream(device, STREAM_ALL);
-        MPI_Barrier(MPI_COMM_WORLD);
-        acDeviceCommunicateHalosMPI(device);
-        acDeviceSynchronizeStream(device, STREAM_ALL);
-        MPI_Barrier(MPI_COMM_WORLD);
-
-        acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
-        acDeviceSynchronizeStream(device, STREAM_DEFAULT);
-        acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
-        if (pid == 0) {
-            // acModelIntegrateStep(model, FLT_EPSILON); // TODO
-            acMeshApplyPeriodicBounds(&model);
-
-            const bool valid = acVerifyMesh(model, candidate);
-            acMeshDestroy(&model);
-            acMeshDestroy(&candidate);
-
-            // Write out
-            char buf[256];
-            sprintf(buf, "nprocs_%d_result_%s", nprocs, valid ? "valid" : "INVALID_CHECK_OUTPUT");
-            FILE* fp = fopen(buf, "w");
-            ERRCHK_ALWAYS(fp);
-            fclose(fp);
-        }
-    }
-#endif
-    // Verification end ///////////////////////////////////////////////////////////////////////
-
-    /*
-    // TODO
-    // Warmup
-    for (int i = 0; i < 10; ++i)
-        acDeviceIntegrateStepMPI(device, 0);
-
-    acDeviceSynchronizeStream(device, STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    // Benchmark start ///////////////////////////////////////////////////////////////////////
-    const int num_iters = 10;
-
-    std::vector<double> results;
-    results.reserve(num_iters);
-
-    Timer total_time;
-    timer_reset(&total_time);
-
-    Timer step_time;
-    for (int i = 0; i < num_iters; ++i) {
-        timer_reset(&step_time);
-
-        const AcReal dt = FLT_EPSILON;
-        acDeviceIntegrateStepMPI(device, dt);
-        acDeviceSynchronizeStream(device, STREAM_ALL);
-        MPI_Barrier(MPI_COMM_WORLD);
-
-        results.push_back(timer_diff_nsec(step_time) / 1e6);
-    }
-
-    const double ms_elapsed     = timer_diff_nsec(total_time) / 1e6;
-    const double nth_percentile = 0.90;
-    std::sort(results.begin(), results.end(),
-              [](const double& a, const double& b) { return a < b; });
-
-    if (pid == 0) {
-        printf("vertices: %d^3, iterations: %d\n", nn, num_iters);
-        printf("Total time: %f ms\n", ms_elapsed);
-        printf("Time per step: %f ms\n", ms_elapsed / num_iters);
-
-        const size_t nth_index = int(nth_percentile * num_iters);
-        printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile),
-               results[nth_index]);
-
-        // Write out
-        char buf[256];
-        sprintf(buf, "nprocs_%d_result_%s.bench", nprocs, BENCH_STRONG_SCALING ? "strong" : "weak");
-        FILE* fp = fopen(buf, "w");
-        ERRCHK_ALWAYS(fp);
-        fprintf(fp, "nprocs, percentile (%dth)\n", int(100 * nth_percentile));
-        fprintf(fp, "%d, %g\n", nprocs, results[nth_index]);
-        fclose(fp);
-    }
-    // Benchmark end ///////////////////////////////////////////////////////////////////////
-    */
-
-    // Finalize
-    acDeviceDestroy(device);
-    acMeshDestroy(&submesh);
-    MPI_Finalize();
-    return AC_SUCCESS;
-}
-
-/*
-// Working, 1D decomposition
-AcResult
-acDeviceRunMPITest(void)
-{
-    // If 1, runs the strong scaling tests and verification.
-    // Verification is disabled when benchmarking weak scaling because the
-    // whole grid is too large to fit into memory.
-#define BENCH_STRONG_SCALING (0)
-
-    MPI_Init(NULL, NULL);
-
-    int num_processes, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-
-    char processor_name[MPI_MAX_PROCESSOR_NAME];
-    int name_len;
-    MPI_Get_processor_name(processor_name, &name_len);
-    printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes);
-
-    // Create model and candidate meshes
-    AcMeshInfo info;
-    acLoadConfig(AC_DEFAULT_CONFIG, &info);
-
-    // Large mesh dim
-    const int nn        = 256;
-    const int num_iters = 10;
-    info.int_params[AC_nx] = info.int_params[AC_ny] = nn;
-    info.int_params[AC_nz] = BENCH_STRONG_SCALING ? nn : nn * num_processes;
-    info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
-    info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy];
-    info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz];
-    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];
-    acUpdateConfig(&info);
-    acPrintMeshInfo(info);
-    ERRCHK_ALWAYS(info.int_params[AC_nz] % num_processes == 0);
-
-#if BENCH_STRONG_SCALING
-    AcMesh model, candidate;
-
-    // Master CPU
-    if (pid == 0) {
-        acMeshCreate(info, &model);
-        acMeshCreate(info, &candidate);
-
-        acMeshRandomize(&model);
-        acMeshRandomize(&candidate);
-    }
-#endif
-
-    /// DECOMPOSITION
-    AcMeshInfo submesh_info = info;
-    const int submesh_nz    = info.int_params[AC_nz] / num_processes;
-    submesh_info.int_params[AC_nz] = submesh_nz;
-    submesh_info.int3_params[AC_global_grid_n] = (int3){
-        info.int_params[AC_nx],
-        info.int_params[AC_ny],
-        info.int_params[AC_nz],
-    };
-    submesh_info.int3_params[AC_multigpu_offset] = (int3){0, 0, pid * submesh_nz};
-    acUpdateConfig(&submesh_info);
-    ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_inv_dsx]));
-    ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_cs2_sound]));
-    //
-
-    AcMesh submesh;
-    acMeshCreate(submesh_info, &submesh);
-    acMeshRandomize(&submesh);
-
-#if BENCH_STRONG_SCALING
-    acDeviceDistributeMeshMPI(model, &submesh);
-#endif
-
-    // Init the GPU
-    int devices_per_node = -1;
-    cudaGetDeviceCount(&devices_per_node);
-
-    Device device;
-    acDeviceCreate(pid % devices_per_node, submesh_info, &device);
-    acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
-
-    // Enable peer access
-    MPI_Barrier(MPI_COMM_WORLD);
-    const int front = (device->id + 1) % devices_per_node;
-    const int back  = (device->id + devices_per_node - 1) % devices_per_node;
-    cudaSetDevice(device->id);
-    WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
-    WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
-
-// Verification start ///////////////////////////////////////////////////////////////////////
-#if BENCH_STRONG_SCALING
-    {
-        const float dt = FLT_EPSILON;
-        acDeviceIntegrateStepMPI(device, dt);
-        acDeviceBoundStepMPI(device);
-        acDeviceSynchronizeStream(device, STREAM_ALL);
-
-        acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
-        acDeviceGatherMeshMPI(submesh, &candidate);
-        if (pid == 0) {
-            acModelIntegrateStep(model, FLT_EPSILON);
-            acMeshApplyPeriodicBounds(&model);
-
-            const bool valid = acVerifyMesh(model, candidate);
-            acMeshDestroy(&model);
-            acMeshDestroy(&candidate);
-
-            // Write out
-            char buf[256];
"valid" : "INVALID_CHECK_OUTPUT"); - FILE* fp = fopen(buf, "w"); - ERRCHK_ALWAYS(fp); - fclose(fp); - } - } -#endif - // Verification end /////////////////////////////////////////////////////////////////////// - - // Warmup - for (int i = 0; i < 10; ++i) - acDeviceIntegrateStepMPI(device, 0); - - acDeviceSynchronizeStream(device, STREAM_ALL); - MPI_Barrier(MPI_COMM_WORLD); - - // Benchmark start /////////////////////////////////////////////////////////////////////// - std::vector results; - results.reserve(num_iters); - - Timer total_time; - timer_reset(&total_time); - - Timer step_time; - for (int i = 0; i < num_iters; ++i) { - timer_reset(&step_time); - - const AcReal dt = FLT_EPSILON; - acDeviceIntegrateStepMPI(device, dt); - acDeviceSynchronizeStream(device, STREAM_ALL); - MPI_Barrier(MPI_COMM_WORLD); - - results.push_back(timer_diff_nsec(step_time) / 1e6); - } - - const double ms_elapsed = timer_diff_nsec(total_time) / 1e6; - const double nth_percentile = 0.90; - std::sort(results.begin(), results.end(), - [](const double& a, const double& b) { return a < b; }); - - if (pid == 0) { - printf("vertices: %d^3, iterations: %d\n", nn, num_iters); - printf("Total time: %f ms\n", ms_elapsed); - printf("Time per step: %f ms\n", ms_elapsed / num_iters); - - const size_t nth_index = int(nth_percentile * num_iters); - printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), -results[nth_index]); - - // Write out - char buf[256]; - sprintf(buf, "nprocs_%d_result_%s.bench", num_processes, - BENCH_STRONG_SCALING ? "strong" : "weak"); - FILE* fp = fopen(buf, "w"); - ERRCHK_ALWAYS(fp); - fprintf(fp, "num_processes, percentile (%dth)\n", int(100 * nth_percentile)); - fprintf(fp, "%d, %g\n", num_processes, results[nth_index]); - fclose(fp); - } - // Benchmark end /////////////////////////////////////////////////////////////////////// - - // Finalize - acDeviceDestroy(device); - acMeshDestroy(&submesh); - MPI_Finalize(); - return AC_SUCCESS; -} -*/ - -#else -AcResult -acDeviceRunMPITest(void) -{ - WARNING("MPI was not enabled but acDeviceRunMPITest() was called"); - return AC_FAILURE; -} -#endif - -#if PACKED_DATA_TRANSFERS // DEPRECATED, see AC_MPI_ENABLED instead +#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead // Functions for calling packed data transfers #endif diff --git a/src/core/kernels/packing.cu b/src/core/kernels/packing.cu index 0a5dca8..c2c8353 100644 --- a/src/core/kernels/packing.cu +++ b/src/core/kernels/packing.cu @@ -28,93 +28,6 @@ #include "src/core/errchk.h" -/* -__global__ void -kernel_pack_data(const AcReal* unpacked, const int3 unpacked_start, const int3 packed_dimensions, - AcReal* packed) -{ - const int i_packed = threadIdx.x + blockIdx.x * blockDim.x; - const int j_packed = threadIdx.y + blockIdx.y * blockDim.y; - const int k_packed = threadIdx.z + blockIdx.z * blockDim.z; - - // If within the start-end range (this allows threadblock dims that are not - // divisible by end - start) - if (i_packed >= packed_dimensions.x || // - j_packed >= packed_dimensions.y || // - k_packed >= packed_dimensions.z) { - return; - } - - const int i_unpacked = i_packed + unpacked_start.x; - const int j_unpacked = j_packed + unpacked_start.y; - const int k_unpacked = k_packed + unpacked_start.z; - - const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked); - const int packed_idx = i_packed + // - j_packed * packed_dimensions.x + // - k_packed * packed_dimensions.x * packed_dimensions.y; - - packed[packed_idx] = unpacked[unpacked_idx]; -} 
diff --git a/src/core/kernels/packing.cu b/src/core/kernels/packing.cu
index 0a5dca8..c2c8353 100644
--- a/src/core/kernels/packing.cu
+++ b/src/core/kernels/packing.cu
@@ -28,93 +28,6 @@
 #include "src/core/errchk.h"
 
-/*
-__global__ void
-kernel_pack_data(const AcReal* unpacked, const int3 unpacked_start, const int3 packed_dimensions,
-                 AcReal* packed)
-{
-    const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_packed >= packed_dimensions.x || //
-        j_packed >= packed_dimensions.y || //
-        k_packed >= packed_dimensions.z) {
-        return;
-    }
-
-    const int i_unpacked = i_packed + unpacked_start.x;
-    const int j_unpacked = j_packed + unpacked_start.y;
-    const int k_unpacked = k_packed + unpacked_start.z;
-
-    const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
-    const int packed_idx   = i_packed +                       //
-                             j_packed * packed_dimensions.x + //
-                             k_packed * packed_dimensions.x * packed_dimensions.y;
-
-    packed[packed_idx] = unpacked[unpacked_idx];
-}
-
-__global__ void
-kernel_unpack_data(const AcReal* packed, const int3 packed_dimensions, const int3 unpacked_start,
-                   AcReal* unpacked)
-{
-    const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_packed >= packed_dimensions.x || //
-        j_packed >= packed_dimensions.y || //
-        k_packed >= packed_dimensions.z) {
-        return;
-    }
-
-    const int i_unpacked = i_packed + unpacked_start.x;
-    const int j_unpacked = j_packed + unpacked_start.y;
-    const int k_unpacked = k_packed + unpacked_start.z;
-
-    const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
-    const int packed_idx   = i_packed +                       //
-                             j_packed * packed_dimensions.x + //
-                             k_packed * packed_dimensions.x * packed_dimensions.y;
-
-    unpacked[unpacked_idx] = packed[packed_idx];
-}
-
-AcResult
-acKernelPackData(const cudaStream_t stream, const AcReal* unpacked, const int3 unpacked_start,
-                 const int3 packed_dimensions, AcReal* packed)
-{
-    const dim3 tpb(32, 8, 1);
-    const dim3 bpg((unsigned int)ceil(packed_dimensions.x / (float)tpb.x),
-                   (unsigned int)ceil(packed_dimensions.y / (float)tpb.y),
-                   (unsigned int)ceil(packed_dimensions.z / (float)tpb.z));
-
-    kernel_pack_data<<<bpg, tpb, 0, stream>>>(unpacked, unpacked_start, packed_dimensions, packed);
-    ERRCHK_CUDA_KERNEL_ALWAYS(); // TODO SET W/ DEBUG ONLY
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acKernelUnpackData(const cudaStream_t stream, const AcReal* packed, const int3 packed_dimensions,
-                   const int3 unpacked_start, AcReal* unpacked)
-{
-    const dim3 tpb(32, 8, 1);
-    const dim3 bpg((unsigned int)ceil(packed_dimensions.x / (float)tpb.x),
-                   (unsigned int)ceil(packed_dimensions.y / (float)tpb.y),
-                   (unsigned int)ceil(packed_dimensions.z / (float)tpb.z));
-
-    kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, packed_dimensions, unpacked_start,
-                                                unpacked);
-    ERRCHK_CUDA_KERNEL_ALWAYS(); // TODO SET W/ DEBUG ONLY
-    return AC_SUCCESS;
-}*/
-
 __global__ void
 kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
 {
@@ -201,36 +114,3 @@ acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int
     ERRCHK_CUDA_KERNEL_ALWAYS(); // TODO SET W/ DEBUG ONLY
     return AC_SUCCESS;
 }
-
-AcResult
-acKernelPackCorner(void)
-{
-    return AC_FAILURE;
-}
-AcResult
-acKernelUnpackCorner(void)
-{
-    return AC_FAILURE;
-}
-
-AcResult
-acKernelPackEdge(void)
-{
-    return AC_FAILURE;
-}
-AcResult
-acKernelUnpackEdge(void)
-{
-    return AC_FAILURE;
-}
-
-AcResult
-acKernelPackSide(void)
-{
-    return AC_FAILURE;
-}
-AcResult
-acKernelUnpackSide(void)
-{
-    return AC_FAILURE;
-}
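Note (outside the diff): the kernels kept above linearize one dims.x * dims.y * dims.z block per vertex buffer into PackedData.data. A host-side sketch of the index mapping, useful for checking a packed buffer element by element after copying it to the host; packed_idx is a hypothetical helper and the buffer-major ordering is an assumption inferred from the old kernels and the count used by the removed test (dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES):

// Maps cell (i, j, k) of vertex buffer `vtxbuf` within a dims-sized block to
// its offset in PackedData.data, assuming the same per-buffer layout as the
// old kernel_pack_data with one contiguous block appended per vertex buffer:
//   idx = i + j*dims.x + k*dims.x*dims.y + vtxbuf*dims.x*dims.y*dims.z
static size_t
packed_idx(const int3 dims, const int i, const int j, const int k, const int vtxbuf)
{
    return (size_t)i +                   //
           (size_t)j * dims.x +          //
           (size_t)k * dims.x * dims.y + //
           (size_t)vtxbuf * dims.x * dims.y * dims.z;
}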
diff --git a/src/core/kernels/packing.cuh b/src/core/kernels/packing.cuh
index 7b86183..f425081 100644
--- a/src/core/kernels/packing.cuh
+++ b/src/core/kernels/packing.cuh
@@ -28,15 +28,6 @@
 #include "astaroth.h"
 #include "common.cuh"
 
-/*
-AcResult acKernelPackData(const cudaStream_t stream, const AcReal* unpacked,
-                          const int3 unpacked_start, const int3 packed_dimensions, AcReal* packed);
-
-AcResult acKernelUnpackData(const cudaStream_t stream, const AcReal* packed,
-                            const int3 packed_dimensions, const int3 unpacked_start,
-                            AcReal* unpacked);
-*/
-
 typedef struct {
     int3 dims;
     AcReal* data;
@@ -47,12 +38,3 @@ AcResult acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba
 
 AcResult acKernelUnpackData(const cudaStream_t stream, const PackedData packed,
                             const int3 vba_start, VertexBufferArray vba);
-
-AcResult acKernelPackCorner(void);
-AcResult acKernelUnpackCorner(void);
-
-AcResult acKernelPackEdge(void);
-AcResult acKernelUnpackEdge(void);
-
-AcResult acKernelPackSide(void);
-AcResult acKernelUnpackSide(void);
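End note (outside the diff): after this commit the surviving packing interface is just PackedData plus the two entry points declared above. A self-contained smoke-test sketch of how a caller drives them, assuming a valid Device handle as defined in device.cc; testPackUnpackRoundTrip is a hypothetical harness function:

// Sketch: pack an NGHOST^3 block starting at the first interior cell, then
// unpack it back to the same location; the vertex buffers must be unchanged.
static AcResult
testPackUnpackRoundTrip(const Device device)
{
    const int3 dims  = (int3){NGHOST, NGHOST, NGHOST};
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int count  = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;

    AcReal* buf = NULL;
    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&buf, count * sizeof(buf[0])));

    const cudaStream_t stream = device->streams[STREAM_DEFAULT];
    PackedData packed         = {dims, buf};
    acKernelPackData(stream, device->vba, start, packed);   // vba -> packed
    acKernelUnpackData(stream, packed, start, device->vba); // packed -> vba
    ERRCHK_CUDA_ALWAYS(cudaStreamSynchronize(stream));

    cudaFree(buf);
    return AC_SUCCESS;
}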