Multi-GPU optimizations: removed some unnecessary synchronization and split the calculation of boundary conditions into local and global steps.

This commit is contained in:
jpekkila
2019-07-05 18:21:44 +03:00
parent f1066a2c11
commit 5fdfdeca9e
3 changed files with 52 additions and 2 deletions

View File

@@ -150,7 +150,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreate(&device->streams[i]);
cudaStreamCreateWithFlags(&device->streams[i], cudaStreamNonBlocking);
}
// Memory