Multi-GPU optimizations: removed some unnecessary synchronization and divided the calculation of boundary conditions into local and global steps.

2019-07-05 18:21:44 +03:00
parent f1066a2c11
commit 5fdfdeca9e
3 changed files with 52 additions and 2 deletions
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -150,7 +150,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
-        cudaStreamCreate(&device->streams[i]);
+        cudaStreamCreateWithFlags(&device->streams[i], cudaStreamNonBlocking);
    }

    // Memory