diff --git a/src/core/device.cu b/src/core/device.cu
index ffcb9be..849672c 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -180,21 +180,21 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
     // VBA in/out
     const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->vba.in[i], vba_size_bytes));
-        ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->vba.out[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
     }
 
     // VBA Profiles
     const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
                                                             max(device_config.int_params[AC_my],
                                                                 device_config.int_params[AC_mz]));
     for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->vba.profiles[i], profile_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
     }
 
     // Reductions
-    ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->reduce_scratchpad,
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
                                          acVertexBufferCompdomainSizeBytes(device_config)));
-    ERRCHK_CUDA_ALWAYS(cudaMallocManaged(&device->reduce_result, sizeof(AcReal)));
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
 #if PACKED_DATA_TRANSFERS
     // Allocate data required for packed transfers here (cudaMalloc)
@@ -862,7 +862,63 @@ acDeviceGatherMeshMPI(const AcMesh src, AcMesh* dst)
 static AcResult
 acDeviceCommunicateHalosMPI(const Device device)
 {
-    MPI_Barrier(MPI_COMM_WORLD);
+    //MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, num_processes;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
+
+    const size_t count = device->local_config.int_params[AC_mx] *
+                         device->local_config.int_params[AC_my] * NGHOST;
+    MPI_Request recv_requests[2*NUM_VTXBUF_HANDLES];
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        { // Front
+            // ...|ooooxxx|... -> xxx|ooooooo|...
+            const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
+                                                     device->local_config);
+            const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
+            const int send_pid = (pid + 1) % num_processes;
+            const int recv_pid = (pid + num_processes - 1) % num_processes;
+
+            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD,
+                      &recv_requests[i]);
+
+            MPI_Request request;
+            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD,
+                      &request);
+
+        }
+        { // Back
+            // ...|ooooooo|xxx <- ...|xxxoooo|...
+            const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
+            const size_t dst_idx = acVertexBufferIdx(
+                0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config);
+            const int send_pid = (pid + num_processes - 1) % num_processes;
+            const int recv_pid = (pid + 1) % num_processes;
+
+            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, NUM_VTXBUF_HANDLES + i,
+                      MPI_COMM_WORLD, &recv_requests[NUM_VTXBUF_HANDLES + i]);
+
+            MPI_Request request;
+            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid,
+                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request);
+        }
+    }
+    MPI_Status recv_statuses[2*NUM_VTXBUF_HANDLES];
+    MPI_Waitall(2*NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
+
+    return AC_SUCCESS;
+}
+
+#if 0
+/** NOTE: Assumes 1 process per GPU */
+static AcResult
+acDeviceCommunicateHalosMPI(const Device device)
+{
+    //MPI_Barrier(MPI_COMM_WORLD);
     MPI_Datatype datatype = MPI_FLOAT;
     if (sizeof(AcReal) == 8)
         datatype = MPI_DOUBLE;
@@ -877,6 +933,7 @@ acDeviceCommunicateHalosMPI(const Device device)
     MPI_Status status_front[NUM_VTXBUF_HANDLES];
     MPI_Request request_back[NUM_VTXBUF_HANDLES];
     MPI_Status status_back[NUM_VTXBUF_HANDLES];
+    //#pragma omp parallel for
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         { // Front
             // ...|ooooxxx|... -> xxx|ooooooo|...
@@ -900,6 +957,7 @@ acDeviceCommunicateHalosMPI(const Device device)
                       NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request_back[i]);
         }
     }
+    //#pragma omp parallel for
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         { // Front
             // ...|ooooxxx|... -> xxx|ooooooo|...
@@ -921,6 +979,7 @@ acDeviceCommunicateHalosMPI(const Device device)
                       MPI_COMM_WORLD, &status_back[i]);
         }
     }
+    //#pragma omp parallel for
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        MPI_Wait(&request_front[i], &status_front[i]);
        MPI_Wait(&request_back[i], &status_back[i]);
@@ -968,6 +1027,7 @@ acDeviceCommunicateHalosMPI(const Device device)
     */
     return AC_SUCCESS;
 }
+#endif
 
 static void
 acHostCommunicateHalosMPI(AcMesh* submesh)
@@ -1036,6 +1096,9 @@ acDeviceRunMPITest(void)
 {
     int num_processes, pid;
     MPI_Init(NULL, NULL);
+    //int provided;
+    //MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); // Hybrid MP + MPI
+    //ERRCHK_ALWAYS(provided == MPI_THREAD_MULTIPLE);
     MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
 
@@ -1067,12 +1130,18 @@ acDeviceRunMPITest(void)
 #endif /* MPIX_CUDA_AWARE_SUPPORT */
     //////// Borrowing end
+    int direct = getenv("MPICH_RDMA_ENABLED_CUDA")==NULL?0:atoi(getenv ("MPICH_RDMA_ENABLED_CUDA"));
+    if(direct != 1){
+        printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
+        exit (EXIT_FAILURE);
+    }
 
     // Create model and candidate meshes
     AcMeshInfo info;
     acLoadConfig(AC_DEFAULT_CONFIG, &info);
 
     const int nn = 256;
-    info.int_params[AC_nx] = info.int_params[AC_ny] = info.int_params[AC_nz] = nn;
+    info.int_params[AC_nx] = info.int_params[AC_ny] = nn;
+    info.int_params[AC_nz] = 4*512;
     acUpdateConfig(&info);
 
     AcMesh model, candidate;
@@ -1118,6 +1187,21 @@ acDeviceRunMPITest(void)
 
     acDeviceCreate(0, submesh_info, &device);
     acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
+
+    // Warmup
+    acDeviceSynchronizeStream(device, STREAM_ALL);
+    for (int i = 0; i < 10; ++i) {
+        ///// Communication start
+        {
+            const int3 start = (int3){0, 0, NGHOST};
+            const int3 end = (int3){subgrid_m.x, subgrid_m.y, subgrid_m.z - NGHOST};
+            acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
+        }
+        acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+        // Includes periodic bounds at first and last ghost zone
+        acDeviceCommunicateHalosMPI(device);
+    }
+
     ////////////////////////////// Timer start
     const int num_iters = 100;
     Timer total_time;
@@ -1131,7 +1215,7 @@ acDeviceRunMPITest(void)
         }
 #if 1 // GPU-GPU if CUDA-aware MPI, otherwise managed CPU-GPU-GPU-CPU
         acDeviceSynchronizeStream(device, STREAM_DEFAULT);
-        MPI_Barrier(MPI_COMM_WORLD);
+        //MPI_Barrier(MPI_COMM_WORLD);
         acDeviceCommunicateHalosMPI(
             device); // Includes periodic bounds at first and last ghost zone
         MPI_Barrier(MPI_COMM_WORLD);
@@ -1146,6 +1230,7 @@ acDeviceRunMPITest(void)
         const double ms_elapsed = timer_diff_nsec(total_time) / 1e6;
         printf("vertices: %d^3, iterations: %d\n", nn, num_iters);
         printf("Total time: %f ms\n", ms_elapsed);
+        printf("Time per step: %f ms\n", ms_elapsed / num_iters);
     }
     ////////////////////////////// Timer end
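
Note (not part of the patch): the new acDeviceCommunicateHalosMPI above exchanges the front and back halo slabs of each vertex buffer with the neighboring ranks in a periodic 1D ring, posting one MPI_Irecv/MPI_Isend pair per field and direction and collecting the receives with a single MPI_Waitall. The sketch below shows the same pattern reduced to one host-side field; the layout constants (MX, MY, NZ, NGHOST), element type and the exchange_halos() helper are illustrative only, not Astaroth API. The patched code passes device pointers (device->vba.in[i]) to MPI, which is why the test now aborts unless MPICH_RDMA_ENABLED_CUDA (CUDA-aware MPI) is set.

/* Minimal sketch of the ring-style halo exchange in the +/-z direction.
 * Field layout: MX*MY*(NZ + 2*NGHOST) elements, z-slowest, with NGHOST
 * ghost layers below and above the computational domain. */
#include <mpi.h>
#include <stdlib.h>

#define NGHOST 3
#define MX 32 /* padded x extent (illustrative) */
#define MY 32 /* padded y extent (illustrative) */
#define NZ 32 /* computational z extent (illustrative) */

static void
exchange_halos(double* field)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    const int count = MX * MY * NGHOST;           /* one halo slab */
    const int up    = (pid + 1) % nprocs;         /* neighbor in +z */
    const int down  = (pid + nprocs - 1) % nprocs; /* neighbor in -z */

    /* Slab pointers, in elements from the start of the field. */
    double* bottom_ghost = field;                                 /* z in [0, NGHOST)       */
    double* bottom_comp  = field + (size_t)MX * MY * NGHOST;      /* z in [NGHOST, 2*NGHOST) */
    double* top_comp     = field + (size_t)MX * MY * NZ;          /* z in [NZ, NZ + NGHOST)  */
    double* top_ghost    = field + (size_t)MX * MY * (NZ + NGHOST);

    /* Post receives first, then the matching sends; tags 0/1 separate the two directions. */
    MPI_Request reqs[4];
    MPI_Irecv(bottom_ghost, count, MPI_DOUBLE, down, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Irecv(top_ghost, count, MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &reqs[1]);
    MPI_Isend(top_comp, count, MPI_DOUBLE, up, 0, MPI_COMM_WORLD, &reqs[2]);
    MPI_Isend(bottom_comp, count, MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &reqs[3]);

    /* Wait for both receives and sends so no request handles remain outstanding. */
    MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
}

int
main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    double* field = (double*)calloc((size_t)MX * MY * (NZ + 2 * NGHOST), sizeof(double));
    exchange_halos(field);
    free(field);
    MPI_Finalize();
    return 0;
}

One difference in request handling: the sketch keeps its MPI_Isend handles and waits on them in the same MPI_Waitall, whereas the patched function waits only on the receive requests and lets the per-iteration send handles go out of scope.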