From c93b3265e6eb66599e5f9606cdc5a4cbb98a0acf Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 22 Apr 2020 17:03:53 +0300 Subject: [PATCH] Made comm streams high prio --- src/core/device.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index 4a81246..7a75496 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -576,6 +576,7 @@ acCreatePackedData(const int3 dims) #if AC_MPI_RT_PINNING ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes)); + //ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly slower than pinned (38 ms vs. 125 ms) #endif // AC_MPI_RT_PINNING #if AC_MPI_UNIDIRECTIONAL_COMM @@ -674,7 +675,7 @@ acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddat const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA( - cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDeviceToHost, stream)); + cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream)); } static void @@ -690,7 +691,7 @@ acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* dd const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA( - cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyHostToDevice, stream)); + cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream)); } #endif // AC_MPI_RT_PINNING @@ -906,7 +907,9 @@ acCreateCommData(const Device device, const int3 dims, const size_t count) data.dsts_host[i] = acCreatePackedDataHost(dims); #endif - cudaStreamCreate(&data.streams[i]); + int low_prio, high_prio; + cudaDeviceGetStreamPriorityRange(&low_prio, &high_prio); + cudaStreamCreateWithPriority(&data.streams[i], cudaStreamNonBlocking, high_prio); } return data;