diff --git a/src/core/device.cu b/src/core/device.cu index c49982f..6c54881 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -784,7 +784,7 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType // Needs mx * my * NGHOST * NUM_VTXBUF_HANDLES threads static __global__ void -pack_data(VertexBufferArray vba, const int3 start, AcReal* buffer) +pack_data(const VertexBufferArray vba, const int3 src, AcReal* __restrict__ buffer) { const int3 m = (int3){ DCONST(AC_mx), @@ -797,7 +797,7 @@ pack_data(VertexBufferArray vba, const int3 start, AcReal* buffer) if (vertexIdx >= m.x * m.y * NGHOST * NUM_VTXBUF_HANDLES) return; - const int vba_idx = IDX(start) + (vertexIdx % block_size); + const int vba_idx = IDX(src) + (vertexIdx % block_size); const int vba_handle = vertexIdx / block_size; const int buf_idx = vertexIdx; @@ -806,7 +806,7 @@ pack_data(VertexBufferArray vba, const int3 start, AcReal* buffer) } static __global__ void -unpack_data(VertexBufferArray vba, const int3 start, AcReal* buffer) +unpack_data(const AcReal* __restrict__ buffer, VertexBufferArray vba, const int3 dst) { const int3 m = (int3){ DCONST(AC_mx), @@ -819,7 +819,7 @@ unpack_data(VertexBufferArray vba, const int3 start, AcReal* buffer) if (vertexIdx >= m.x * m.y * NGHOST * NUM_VTXBUF_HANDLES) return; - const int vba_idx = IDX(start) + (vertexIdx % block_size); + const int vba_idx = IDX(dst) + (vertexIdx % block_size); const int vba_handle = vertexIdx / block_size; const int buf_idx = vertexIdx; @@ -1661,36 +1661,21 @@ acDeviceRunMPITest(void) MPI_Get_processor_name(processor_name, &name_len); printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes); - // Check MPI support - //// Borrowing start (from OpenMPI examples) -#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT - printf("This MPI library has CUDA-aware support.\n", MPIX_CUDA_AWARE_SUPPORT); -#elif defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT - printf("This MPI library does not have CUDA-aware support.\n"); +#ifdef MPIX_CUDA_AWARE_SUPPORT + if (MPIX_Query_cuda_support()) + printf("CUDA-aware MPI supported (MPIX)\n"); + else + WARNING("CUDA-aware MPI not supported with this MPI library (MPIX)\n"); #else - printf("This MPI library cannot determine if there is CUDA-aware support.\n"); -#endif /* MPIX_CUDA_AWARE_SUPPORT */ + printf("MPIX_CUDA_AWARE_SUPPORT was not defined. Do not know wheter CUDA-aware MPI is " + "supported\n"); +#endif - printf("Run time check:\n"); -#if defined(MPIX_CUDA_AWARE_SUPPORT) - if (1 == MPIX_Query_cuda_support()) { - printf("This MPI library has CUDA-aware support.\n"); - } - else { - printf("This MPI library does not have CUDA-aware support.\n"); - } -#else /* !defined(MPIX_CUDA_AWARE_SUPPORT) */ - printf("This MPI library cannot determine if there is CUDA-aware support.\n"); -#endif /* MPIX_CUDA_AWARE_SUPPORT */ - //////// Borrowing end + if (getenv("MPICH_RDMA_ENABLED_CUDA") && atoi(getenv("MPICH_RDMA_ENABLED_CUDA"))) + printf("CUDA-aware MPI supported (MPICH)\n"); + else + WARNING("MPICH not used or this MPI library does not support CUDA-aware MPI\n"); - int direct = getenv("MPICH_RDMA_ENABLED_CUDA") == NULL - ? 0 - : atoi(getenv("MPICH_RDMA_ENABLED_CUDA")); - if (direct != 1) { - printf("MPICH_RDMA_ENABLED_CUDA not enabled!\n"); - exit(EXIT_FAILURE); - } // Create model and candidate meshes AcMeshInfo info; acLoadConfig(AC_DEFAULT_CONFIG, &info); @@ -1756,6 +1741,14 @@ acDeviceRunMPITest(void) printf("Time per step: %f ms\n", ms_elapsed / num_iters); } ////////////////////////////// Timer end + const cudaStream_t default_stream = device->streams[STREAM_DEFAULT]; + const int block_size = submesh_info.int_params[AC_mx] * submesh_info.int_params[AC_my] * NGHOST; + const dim3 tpb(256, 1, 1); + const dim3 bpg((uint)ceil((block_size * NUM_VTXBUF_HANDLES) / (float)tpb.x), 1, 1); + const int3 src = (int3){0, 0, 0}; // TODO + const int3 dst = (int3){0, 0, 0}; // TODO + pack_data<<>>(device->vba, src, device->inner[0]); + unpack_data<<>>(device->inner[0], device->vba, dst); acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh); acDeviceDestroy(device);