Simplified the optimized multi-GPU integration function

2019-08-07 18:17:03 +03:00
parent fd94b6321d
commit c2bd5ae3e6
1 changed file with 35 additions and 70 deletions
--- a/src/core/node.cu
+++ b/src/core/node.cu
@@ -545,8 +545,8 @@ acNodeIntegrate(const Node node, const AcReal dt)
    //   n0   n1        n2  n3
    const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
    const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-    const int3 n2 = node->grid.n;
+    // const int3 n2 = node->grid.n;
-    const int3 n3 = n0 + node->grid.n;
+    // const int3 n3 = n0 + node->grid.n;
    for (int isubstep = 0; isubstep < 3; ++isubstep) {
        acNodeSynchronizeStream(node, STREAM_ALL);
@@ -556,85 +556,50 @@ acNodeIntegrate(const Node node, const AcReal dt)
        acNodeSynchronizeStream(node, STREAM_ALL);
        // Inner inner
        for (int i = 0; i < node->num_devices; ++i) {
-            const int3 m1 = n1 + (int3){0, 0, i * node->subgrid.n.z};
+            const int3 m1 = n1;
-            const int3 m2 = m1 + node->subgrid.n - (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = node->subgrid.n;
-            acNodeIntegrateSubstep(node, STREAM_16, isubstep, m1, m2, dt);
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
        }
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
-            const int num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
+            acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
-            for (int device_id = 0; device_id < node->num_devices; ++device_id) {
+            global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
                // ...|ooooxxx|... -> xxx|ooooooo|...
                {
                    const int3 src = (int3){0, 0, node->subgrid.n.z};
                    const int3 dst = (int3){0, 0, 0};
                    acDeviceTransferVertexBufferWithOffset(
                        node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src,
                        dst, num_vertices, node->devices[(device_id + 1) % node->num_devices]);
                }
                // ...|ooooooo|xxx <- ...|xxxoooo|...
                {
                    const int3 src = (int3){0, 0, NGHOST};
                    const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
                    acDeviceTransferVertexBufferWithOffset(
                        node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src,
                        dst, num_vertices,
                        node->devices[(device_id - 1 + node->num_devices) % node->num_devices]);
                }
            }
        }
-        for (int vtxbuf = 0; vtxbuf < 2 * NUM_VTXBUF_HANDLES; ++vtxbuf) {
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            acNodeSynchronizeStream(node, (Stream)vtxbuf);
        }
        // Inner outer
        for (int i = 0; i < node->num_devices - 1; ++i) {
            const int3 m1 = n1 + (int3){0, 0, (i + 1) * node->subgrid.n.z - 2 * NGHOST};
            const int3 m2 = m1 + (int3){node->subgrid.n.x - 2 * NGHOST,
                                        node->subgrid.n.y - 2 * NGHOST, 2 * NGHOST};
            acNodeIntegrateSubstep(node, STREAM_0, isubstep, m1, m2, dt);
        }
        // Outer
        // Front
        {
            const int3 m1 = (int3){n0.x, n0.y, n0.z};
            const int3 m2 = (int3){n3.x, n3.y, n1.z};
            acNodeIntegrateSubstep(node, STREAM_1, isubstep, m1, m2, dt);
        }
-        // Back
+        for (int i = 0; i < node->num_devices; ++i) { // Front
-        {
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
-            const int3 m1 = (int3){n0.x, n0.y, n2.z};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
-            const int3 m2 = (int3){n3.x, n3.y, n3.z};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
            acNodeIntegrateSubstep(node, STREAM_2, isubstep, m1, m2, dt);
        }
-
+        for (int i = 0; i < node->num_devices; ++i) { // Back
-        // Top
+            const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
-        {
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
-            const int3 m1 = (int3){n0.x, n0.y, n1.z};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
            const int3 m2 = (int3){n3.x, n1.y, n2.z};
            acNodeIntegrateSubstep(node, STREAM_3, isubstep, m1, m2, dt);
        }
-
+        for (int i = 0; i < node->num_devices; ++i) { // Bottom
-        // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
-        {
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
-            const int3 m1 = (int3){n0.x, n2.y, n1.z};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
            const int3 m2 = (int3){n3.x, n3.y, n2.z};
            acNodeIntegrateSubstep(node, STREAM_4, isubstep, m1, m2, dt);
        }
-
+        for (int i = 0; i < node->num_devices; ++i) { // Top
-        // Left
+            const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
-        {
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
-            const int3 m1 = (int3){n0.x, n1.y, n1.z};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
            const int3 m2 = (int3){n1.x, n2.y, n2.z};
            acNodeIntegrateSubstep(node, STREAM_5, isubstep, m1, m2, dt);
        }
-
+        for (int i = 0; i < node->num_devices; ++i) { // Left
-        // Right
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
-        {
+            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
-            const int3 m1 = (int3){n2.x, n1.y, n1.z};
+                                        node->subgrid.n.z - 2 * NGHOST};
-            const int3 m2 = (int3){n3.x, n2.y, n2.z};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
-            acNodeIntegrateSubstep(node, STREAM_6, isubstep, m1, m2, dt);
+        }
        for (int i = 0; i < node->num_devices; ++i) { // Right
            const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
                                        node->subgrid.n.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
        }
        acNodeSwapBuffers(node);
    }
    acNodeSynchronizeStream(node, STREAM_ALL);