Some concurrency optimizations for 3D blocking
@@ -690,19 +690,25 @@ acCreatePackedDataHost(const int3 dims)
 }
 
 static void
-acTransferPackedDataToHost(const PackedData ddata, PackedData* hdata)
+acTransferPackedDataToHost(const Device device, const cudaStream_t stream, const PackedData ddata,
+                           PackedData* hdata)
 {
+    cudaSetDevice(device->id);
+
     const size_t bytes = ddata.dims.x * ddata.dims.y * ddata.dims.z * sizeof(ddata.data[0]) *
                          NUM_VTXBUF_HANDLES;
-    ERRCHK_CUDA_ALWAYS(cudaMemcpy(hdata->data, ddata.data, bytes, cudaMemcpyDeviceToHost));
+    ERRCHK_CUDA(cudaMemcpyAsync(hdata->data, ddata.data, bytes, cudaMemcpyDeviceToHost, stream));
 }
 
 static void
-acTransferPackedDataToDevice(const PackedData hdata, PackedData* ddata)
+acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, const PackedData hdata,
+                             PackedData* ddata)
 {
+    cudaSetDevice(device->id);
+
     const size_t bytes = hdata.dims.x * hdata.dims.y * hdata.dims.z * sizeof(hdata.data[0]) *
                          NUM_VTXBUF_HANDLES;
-    ERRCHK_CUDA_ALWAYS(cudaMemcpy(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice));
+    ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice, stream));
 }
 
 static AcResult
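The transfer helpers now take the owning device and a stream, and the blocking ERRCHK_CUDA_ALWAYS(cudaMemcpy(...)) calls become ERRCHK_CUDA(cudaMemcpyAsync(..., stream)). A minimal, self-contained sketch of that pattern (generic names, not this codebase's API): cudaMemcpyAsync overlaps with host work only when the host buffer is pinned, and the stream must be synchronized before the host reads the data.

// Sketch: asynchronous device-to-host copy on a stream (hypothetical buffers).
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    const size_t bytes = 1 << 20;

    float* d_buf;
    float* h_buf; // pinned host memory; required for the copy to be truly asynchronous
    cudaMalloc(&d_buf, bytes);
    cudaMallocHost(&h_buf, bytes);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaMemsetAsync(d_buf, 0, bytes, stream);                             // device work on the stream
    cudaMemcpyAsync(h_buf, d_buf, bytes, cudaMemcpyDeviceToHost, stream); // D2H copy on the same stream
    cudaStreamSynchronize(stream); // the host may touch h_buf only after this point

    printf("first element: %g\n", (double)h_buf[0]);

    cudaStreamDestroy(stream);
    cudaFreeHost(h_buf);
    cudaFree(d_buf);
    return 0;
}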
@@ -930,7 +936,8 @@ static AcResult acDeviceCommunicateBlocksMPI(const Device device, //
     // Host ////////////////////////////////////////////////
     PackedData src_host = acCreatePackedDataHost(dims);
     PackedData dst_host = acCreatePackedDataHost(dims);
-    acTransferPackedDataToHost(src, &src_host);
+    acTransferPackedDataToHost(device, device->streams[STREAM_DEFAULT], src,
+                               &src_host);
     acDeviceSynchronizeStream(device, STREAM_ALL);
     MPI_Barrier(MPI_COMM_WORLD);
     ////////////////////////////////////////////////////////
@@ -948,7 +955,8 @@ static AcResult acDeviceCommunicateBlocksMPI(const Device device, //
     MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
 
     // Host ////////////////////////////////////////////////
-    acTransferPackedDataToDevice(dst_host, &dst);
+    acTransferPackedDataToDevice(device, device->streams[STREAM_DEFAULT],
+                                 dst_host, &dst);
     acDeviceSynchronizeStream(device, STREAM_ALL);
     acDestroyPackedDataHost(&src_host);
     acDestroyPackedDataHost(&dst_host);
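At these call sites the asynchronous copies are still followed by acDeviceSynchronizeStream(device, STREAM_ALL) before the host buffers are handed to MPI or destroyed. A minimal sketch of such a host-staged exchange (generic names and a simple ring pattern for illustration only, not the project's actual decomposition):

// Sketch: stage device data through pinned host buffers around an MPI exchange.
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    const size_t n = 1 << 16;
    float *d_src, *d_dst, *h_src, *h_dst;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, n * sizeof(float));
    cudaMallocHost(&h_src, n * sizeof(float)); // pinned staging buffers
    cudaMallocHost(&h_dst, n * sizeof(float));
    cudaMemset(d_src, 0, n * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Device -> host; make sure the staging buffer is ready before MPI touches it.
    cudaMemcpyAsync(h_src, d_src, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    // Exchange with neighboring ranks (ring pattern for illustration).
    const int next = (rank + 1) % nprocs;
    const int prev = (rank + nprocs - 1) % nprocs;
    MPI_Request recv_req, send_req;
    MPI_Irecv(h_dst, (int)n, MPI_FLOAT, prev, 0, MPI_COMM_WORLD, &recv_req);
    MPI_Isend(h_src, (int)n, MPI_FLOAT, next, 0, MPI_COMM_WORLD, &send_req);
    MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
    MPI_Wait(&send_req, MPI_STATUS_IGNORE);

    // Host -> device; synchronize before the device data is used or the buffers freed.
    cudaMemcpyAsync(d_dst, h_dst, n * sizeof(float), cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFreeHost(h_src); cudaFreeHost(h_dst);
    cudaFree(d_src); cudaFree(d_dst);
    MPI_Finalize();
    return 0;
}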
@@ -1187,33 +1195,38 @@ static void
 acPackCommData(const Device device, const int3* a0s, const size_t count, CommData* data)
 {
     cudaSetDevice(device->id);
-    cudaStream_t stream = device->streams[STREAM_DEFAULT];
+
+    cudaStream_t streams[count];
+    for (size_t i = 0; i < count; ++i)
+        cudaStreamCreate(&streams[i]);
 
     for (size_t i = 0; i < count; ++i)
-        acKernelPackData(stream, device->vba, a0s[i], data->srcs[i]);
+        acKernelPackData(streams[i], device->vba, a0s[i], data->srcs[i]);
 
-    // HOST
-    acDeviceSynchronizeStream(device, STREAM_DEFAULT);
     for (size_t i = 0; i < count; ++i)
-        acTransferPackedDataToHost(data->srcs[i], &data->srcs_host[i]);
-    //////
+        acTransferPackedDataToHost(device, streams[i], data->srcs[i], &data->srcs_host[i]);
+
+    for (size_t i = 0; i < count; ++i)
+        cudaStreamDestroy(streams[i]);
 }
 
 static void
 acUnpackCommData(const Device device, const int3* b0s, const size_t count, CommData* data)
 {
     cudaSetDevice(device->id);
-    cudaStream_t stream = device->streams[STREAM_DEFAULT];
 
-    // HOST
+    cudaStream_t streams[count];
     for (size_t i = 0; i < count; ++i)
-        acTransferPackedDataToDevice(data->dsts_host[i], &data->dsts[i]);
+        cudaStreamCreate(&streams[i]);
 
-    acDeviceSynchronizeStream(device, STREAM_ALL);
-    //////
+    for (size_t i = 0; i < count; ++i)
+        acTransferPackedDataToDevice(device, streams[i], data->dsts_host[i], &data->dsts[i]);
 
     for (size_t i = 0; i < count; ++i)
-        acKernelUnpackData(stream, data->dsts[i], b0s[i], device->vba);
+        acKernelUnpackData(streams[i], data->dsts[i], b0s[i], device->vba);
+
+    for (size_t i = 0; i < count; ++i)
+        cudaStreamDestroy(streams[i]);
 }
 
 static AcResult
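acPackCommData and acUnpackCommData now create one stream per packed block, so the pack/unpack kernel and the corresponding host transfer of block i can proceed independently of block j. A condensed sketch of that per-stream pattern (generic names; the pack kernel below is a stand-in, not acKernelPackData, and the final synchronization point is an assumption of this sketch):

// Sketch: per-block streams so kernel + D2H copy of each block can overlap.
#include <cuda_runtime.h>

__global__ void pack(float* dst, size_t n) // stand-in for a pack kernel
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = (float)i;
}

int main(void)
{
    const size_t count = 4;       // number of packed blocks
    const size_t n     = 1 << 20; // elements per block

    float* d_src[count];
    float* h_src[count];
    cudaStream_t streams[count];

    for (size_t i = 0; i < count; ++i) {
        cudaMalloc(&d_src[i], n * sizeof(float));
        cudaMallocHost(&h_src[i], n * sizeof(float)); // pinned, required for overlap
        cudaStreamCreate(&streams[i]);
    }

    // Pack and copy each block on its own stream; blocks proceed independently.
    for (size_t i = 0; i < count; ++i) {
        pack<<<(n + 255) / 256, 256, 0, streams[i]>>>(d_src[i], n);
        cudaMemcpyAsync(h_src[i], d_src[i], n * sizeof(float), cudaMemcpyDeviceToHost,
                        streams[i]);
    }

    // Wait for all streams before the host (e.g. MPI) uses the packed data.
    cudaDeviceSynchronize();

    for (size_t i = 0; i < count; ++i) {
        cudaStreamDestroy(streams[i]);
        cudaFreeHost(h_src[i]);
        cudaFree(d_src[i]);
    }
    return 0;
}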