Autoformatted all CUDA/C/C++ code
@@ -28,33 +28,24 @@
 #include "errchk.h"

 #include "device.cuh"
-#include "math_utils.h" // sum for reductions
+#include "math_utils.h" // sum for reductions
 #include "standalone/config_loader.h" // update_config

-const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
+const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
+const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
+const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};

-static const int MAX_NUM_DEVICES = 32;
-static int num_devices = 1;
+static const int MAX_NUM_DEVICES = 32;
+static int num_devices = 1;
 static Device devices[MAX_NUM_DEVICES] = {};

 static Grid
 createGrid(const AcMeshInfo& config)
 {
     Grid grid;
-    grid.m = (int3) {
-        config.int_params[AC_mx],
-        config.int_params[AC_my],
-        config.int_params[AC_mz]
-    };
-    grid.n = (int3) {
-        config.int_params[AC_nx],
-        config.int_params[AC_ny],
-        config.int_params[AC_nz]
-    };
+    grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
+    grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};

     return grid;
 }

@@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k)
 static int3
 gridIdx3d(const Grid& grid, const int idx)
 {
-    return (int3){idx % grid.m.x,
-                  (idx % (grid.m.x * grid.m.y)) / grid.m.x,
+    return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
                   idx / (grid.m.x * grid.m.y)};
 }
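
A minimal sketch (editorial aside, not part of the commit) of the row-major mapping that gridIdx and gridIdx3d implement, assuming gridIdx computes i + j * m.x + k * m.x * m.y; the round trip is the identity on [0, m.x * m.y * m.z):

    #include <cassert>

    struct Int3 { int x, y, z; }; // host-side stand-in for CUDA's int3

    // Forward mapping: 3D vertex coordinates to a row-major linear index.
    static int gridIdxSim(const Int3& m, const int i, const int j, const int k)
    {
        return i + j * m.x + k * m.x * m.y;
    }

    // Inverse mapping, mirroring gridIdx3d in the hunk above.
    static Int3 gridIdx3dSim(const Int3& m, const int idx)
    {
        return Int3{idx % m.x, (idx % (m.x * m.y)) / m.x, idx / (m.x * m.y)};
    }

    int main()
    {
        const Int3 m{8, 4, 2};
        for (int idx = 0; idx < m.x * m.y * m.z; ++idx) {
            const Int3 p = gridIdx3dSim(m, idx);
            assert(gridIdxSim(m, p.x, p.y, p.z) == idx); // round trip holds
        }
        return 0;
    }
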
@@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config)
     ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
     ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);

-    printf("Grid m "); printInt3(grid.m); printf("\n");
-    printf("Grid n "); printInt3(grid.n); printf("\n");
+    // clang-format off
+    printf("Grid m "); printInt3(grid.m); printf("\n");
+    printf("Grid n "); printInt3(grid.n); printf("\n");
     printf("Subrid m "); printInt3(subgrid.m); printf("\n");
     printf("Subrid n "); printInt3(subgrid.n); printf("\n");
+    // clang-format on

     // Initialize the devices
     for (int i = 0; i < num_devices; ++i) {

@@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
         */
         if (db.z >= da.z) {
             const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
-            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
-            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
+            const int3 da_local = (int3){
+                da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
+            // printf("\n");
             copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
         }
     printf("\n");

@@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
         */
         if (db.z >= da.z) {
             const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
-            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
-            // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
+            const int3 da_local = (int3){
+                da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+            // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local);
+            // printf("\n");
             copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
         }
     printf("\n");

@@ -262,10 +258,9 @@ acStore(AcMesh* host_mesh)
 AcResult
 acIntegrateStep(const int& isubstep, const AcReal& dt)
 {
-    const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
-    const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x,
-                            STENCIL_ORDER/2 + subgrid.n.y,
-                            STENCIL_ORDER/2 + subgrid.n.z};
+    const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2};
+    const int3 end = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y,
+                            STENCIL_ORDER / 2 + subgrid.n.z};
     for (int i = 0; i < num_devices; ++i) {
         rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
     }
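
As a worked aside (illustrative numbers, not from the commit): with STENCIL_ORDER = 6 and a per-device subgrid of n = (128, 128, 32), the integration above runs from start = (3, 3, 3) to end = (131, 131, 35), i.e. over the interior of the halo-padded buffer whose extent is m = n + 6 per axis.
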
@@ -278,121 +273,125 @@ acBoundcondStep(void)
 {
     acSynchronize();
     if (num_devices == 1) {
-        boundcondStep(devices[0], STREAM_PRIMARY,
-                      (int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
-    } else {
+        boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0},
+                      (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
+    }
+    else {
         // Local boundary conditions
         for (int i = 0; i < num_devices; ++i) {
-            const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
+            const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE
             const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
             boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
         }

-        /*
-        // ===MIIKKANOTE START==========================================
-        %JP: The old way for computing boundary conditions conflicts with the
-        way we have to do things with multiple GPUs.
+        /*
+        // ===MIIKKANOTE START==========================================
+        %JP: The old way for computing boundary conditions conflicts with the
+        way we have to do things with multiple GPUs.

-        The older approach relied on unified memory, which represented the whole
-        memory area as one huge mesh instead of several smaller ones. However, unified memory
-        in its current state is more meant for quick prototyping when performance is not an issue.
-        Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
-        when managing the memory explicitly.
+        The older approach relied on unified memory, which represented the whole
+        memory area as one huge mesh instead of several smaller ones. However, unified memory
+        in its current state is more meant for quick prototyping when performance is not an issue.
+        Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
+        than when managing the memory explicitly.

-        In this new approach, I have simplified the multi- and single-GPU layers significantly.
-        Quick rundown:
-                New struct: Grid. There are two global variables, "grid" and "subgrid", which
-                contain the extents of the whole simulation domain and the decomposed grids, respectively.
-                To simplify thing, we require that each GPU is assigned the same amount of work,
-                therefore each GPU in the node is assigned and "subgrid.m" -sized block of data
-                to work with.
+        In this new approach, I have simplified the multi- and single-GPU layers significantly.
+        Quick rundown:
+                New struct: Grid. There are two global variables, "grid" and "subgrid", which
+                contain the extents of the whole simulation domain and the decomposed grids,
+                respectively. To simplify thing, we require that each GPU is assigned the same amount of
+                work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to
+                work with.

-        The whole simulation domain is decomposed with respect to the z dimension.
-        For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
-        contain (nx, ny, nz / num_devices) vertices.
+        The whole simulation domain is decomposed with respect to the z dimension.
+        For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
+        contain (nx, ny, nz / num_devices) vertices.

-        An local index (i, j, k) in some subgrid can be mapped to the global grid with
-                global idx = (i, j, k + device_id * subgrid.n.z)
+        An local index (i, j, k) in some subgrid can be mapped to the global grid with
+                global idx = (i, j, k + device_id * subgrid.n.z)
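
A small sketch (editorial aside; the helper names are hypothetical) of the local-to-global mapping described in the note above, where each device owns an nz / num_devices deep z-slab:

    struct Int3 { int x, y, z; };

    // Map a local vertex index on device `device_id` to the global grid,
    // per the formula above: global = (i, j, k + device_id * subgrid_nz).
    static Int3 localToGlobal(const Int3& local, const int device_id, const int subgrid_nz)
    {
        return Int3{local.x, local.y, local.z + device_id * subgrid_nz};
    }

    // Inverse: which device owns global z-index gz, and at which local z.
    static void globalToLocal(const int gz, const int subgrid_nz, int* device_id, int* lz)
    {
        *device_id = gz / subgrid_nz;
        *lz        = gz % subgrid_nz;
    }
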

-        Terminology:
-                - Single-GPU function: a function defined on the single-GPU layer (device.cu)
+        Terminology:
+                - Single-GPU function: a function defined on the single-GPU layer (device.cu)

-        Changes required to this commented code block:
-                - The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu
-                  instead. Same holds for any complex index calculations. Instead, the local coordinates
-                  should be passed as an int3 type without having to consider how the data is actually
-                  laid out in device memory
-                - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
-                  of type "Device" which should be passed to single-GPU functions. In this file, all devices
-                  are stored in a global array "devices[num_devices]".
-                - Every single-GPU function is executed asynchronously by default such that we
-                  can optimize Astaroth by executing memory transactions concurrently with computation.
-                  Therefore a StreamType should be passed as a parameter to single-GPU functions.
-                  Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
-                  as a parameter and commands executing in different streams can be processed
-                  in parallel/concurrently.
+        Changes required to this commented code block:
+                - The thread block dimensions (tpb) are no longer passed to the kernel here but in
+                  device.cu instead. Same holds for any complex index calculations. Instead, the local
+                  coordinates should be passed as an int3 type without having to consider how the data is
+                  actually laid out in device memory
+                - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
+                  handle of type "Device" which should be passed to single-GPU functions. In this file, all
+                  devices are stored in a global array "devices[num_devices]".
+                - Every single-GPU function is executed asynchronously by default such that we
+                  can optimize Astaroth by executing memory transactions concurrently with
+                  computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
+                  Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
+                  as a parameter and commands executing in different streams can be processed
+                  in parallel/concurrently.
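
To illustrate the stream refresher above: a minimal standalone CUDA sketch (editorial aside; the kernel and sizes are made up, not Astaroth code). Launches into distinct streams return immediately and may overlap; synchronizing a stream blocks until its queued work has drained:

    #include <cuda_runtime.h>

    __global__ void scale(float* data, const float s, const int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            data[i] *= s;
    }

    int main()
    {
        const int n = 1 << 20;
        float *a, *b;
        cudaMalloc(&a, n * sizeof(float));
        cudaMalloc(&b, n * sizeof(float));

        cudaStream_t s0, s1;
        cudaStreamCreate(&s0);
        cudaStreamCreate(&s1);

        // Non-blocking launches; the two kernels may execute concurrently
        // because they are enqueued in different streams.
        scale<<<(n + 255) / 256, 256, 0, s0>>>(a, 2.f, n);
        scale<<<(n + 255) / 256, 256, 0, s1>>>(b, 3.f, n);

        cudaStreamSynchronize(s0); // host blocks until stream s0 is done
        cudaStreamSynchronize(s1);

        cudaStreamDestroy(s0);
        cudaStreamDestroy(s1);
        cudaFree(a);
        cudaFree(b);
        return 0;
    }
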

-        Note on periodic boundaries (might be helpful when implementing other boundary conditions):
+        Note on periodic boundaries (might be helpful when implementing other boundary conditions):

-        With multiple GPUs, periodic boundary conditions applied on indices ranging from
+        With multiple GPUs, periodic boundary conditions applied on indices ranging from

-                (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
+                (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
+                STENCIL_ORDER/2)

-        on a single device are "local", in the sense that they can be computed without having
-        to exchange data with neighboring GPUs. Special care is needed only for transferring
-        the data to the fron and back plates outside this range. In the solution we use here,
-        we solve the local boundaries first, and then just exchange the front and back plates
-        in a "ring", like so
-                                        device_id
-                            (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
+        on a single device are "local", in the sense that they can be computed without
+        having to exchange data with neighboring GPUs. Special care is needed only for transferring
+        the data to the fron and back plates outside this range. In the solution we use
+        here, we solve the local boundaries first, and then just exchange the front and back plates
+        in a "ring", like so
+                                        device_id
+                            (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)

-        // ======MIIKKANOTE END==========================================
+        // ======MIIKKANOTE END==========================================

-        <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
-                                                      moved into device.cu, function boundCondStep()
-                                                      In astaroth.cu, we use acBoundcondStep()
-                                                      just to distribute the work and manage
-                                                      communication between GPUs.
+        <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
+                                                      moved into device.cu, function
+        boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and
+        manage communication between GPUs.

-        printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
+        printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y,
+               best_dims.z, double(best_time) / NUM_ITERATIONS);

-        exit(0);
-    #else
+        exit(0);
+    #else

-        const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
+        const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);

-        const int3 start = (int3){0, 0, device_id * depth};
-        const int3 end   = (int3){mesh_info.int_params[AC_mx],
-                                  mesh_info.int_params[AC_my],
-                                  min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
+        const int3 start = (int3){0, 0, device_id * depth};
+        const int3 end   = (int3){mesh_info.int_params[AC_mx],
+                                  mesh_info.int_params[AC_my],
+                                  min((device_id+1) * depth, mesh_info.int_params[AC_mz])};

-        const dim3 tpb(8,2,8);
+        const dim3 tpb(8,2,8);

-        // TODO uses the default stream currently
-        if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
-            wedge_boundconds(0, tpb, start, end, d_buffer);
-        } else {
-            for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
-                periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
-        <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-        */
+        // TODO uses the default stream currently
+        if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
+            wedge_boundconds(0, tpb, start, end, d_buffer);
+        } else {
+            for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+                periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
+        <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+        */

         // Exchange halos
         for (int i = 0; i < num_devices; ++i) {
-            const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
+            const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER / 2;
             // ...|ooooxxx|... -> xxx|ooooooo|...
             {
-                const int3 src = (int3) {0, 0, subgrid.n.z};
-                const int3 dst = (int3) {0, 0, 0};
-                copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
+                const int3 src = (int3){0, 0, subgrid.n.z};
+                const int3 dst = (int3){0, 0, 0};
+                copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src,
+                                       devices[(i + 1) % num_devices], dst, num_vertices);
             }
             // ...|ooooooo|xxx <- ...|xxxoooo|...
             {
-                const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
-                const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
-                copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
+                const int3 src = (int3){0, 0, STENCIL_ORDER / 2};
+                const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z};
+                copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src,
+                                       devices[i], dst, num_vertices);
             }
         }
 }
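
A sketch (editorial aside; values are hypothetical) of the ring pattern in the halo exchange above: each device i sends its front plate to the bottom halo of device (i + 1) % num_devices and receives its top halo from that device's back plate, matching the src/dst pairs in the hunk:

    #include <cstdio>

    int main()
    {
        const int num_devices = 4;
        const int subgrid_nz  = 32; // assumed computational depth per device
        const int halo        = 3;  // STENCIL_ORDER / 2 with STENCIL_ORDER = 6

        for (int i = 0; i < num_devices; ++i) {
            const int next = (i + 1) % num_devices;
            // Front plate of device i -> bottom halo (z starting at 0) of device next.
            printf("dev %d z=[%d..%d) -> dev %d z=[0..%d)\n",
                   i, subgrid_nz, subgrid_nz + halo, next, halo);
            // Back plate of device next -> top halo of device i.
            printf("dev %d z=[%d..%d) -> dev %d z=[%d..%d)\n",
                   next, halo, 2 * halo, i, halo + subgrid_nz, 2 * halo + subgrid_nz);
        }
        return 0;
    }
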
@@ -427,26 +426,28 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
     for (int i = 1; i < n; ++i) {
         if (rtype == RTYPE_MAX) {
             res = max(res, results[i]);
-        } else if (rtype == RTYPE_MIN) {
+        }
+        else if (rtype == RTYPE_MIN) {
             res = min(res, results[i]);
-        } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        }
+        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
             res = sum(res, results[i]);
-        } else {
+        }
+        else {
             ERROR("Invalid rtype");
         }
     }

     if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
         const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
-        res = sqrt(inv_n * res);
+        res = sqrt(inv_n * res);
     }

     return res;
 }

 AcReal
-acReduceScal(const ReductionType& rtype,
-             const VertexBufferHandle& vtxbuffer_handle)
+acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
 {
     AcReal results[num_devices];
     for (int i = 0; i < num_devices; ++i) {
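
An illustrative aside on the RMS path above (made-up numbers, not from the commit): each device reduces its own subgrid to a partial sum of squares, the partials are combined with sum(), and only the final result is scaled by 1/n and rooted, as in simple_final_reduce_scal:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Hypothetical per-device partial sums of squares (one per GPU).
        const double partials[]  = {10.0, 12.5, 9.5, 8.0};
        const int    num_devices = 4;
        const long   n           = 128L * 128L * 128L; // grid.n.x * grid.n.y * grid.n.z

        double res = partials[0];
        for (int i = 1; i < num_devices; ++i)
            res += partials[i]; // plays the role of sum(res, results[i])

        res = sqrt(res / n); // final RMS over the whole grid
        printf("rms = %g\n", res);
        return 0;
    }
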
@@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype,
 }

 AcReal
-acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
-            const VertexBufferHandle& b, const VertexBufferHandle& c)
+acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
+            const VertexBufferHandle& c)
 {
     AcReal results[num_devices];
     for (int i = 0; i < num_devices; ++i) {