Added the machinery for implementing forcing with the DSL on multiple GPUs and a simple model solution

2019-06-18 16:13:32 +03:00
parent 57e2e48fb0
commit 4ca4dbefdf
5 changed files with 55 additions and 53 deletions
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -40,11 +40,6 @@ static const int MAX_NUM_DEVICES = 32;
 static int num_devices = 1;
 static Device devices[MAX_NUM_DEVICES] = {};

-typedef struct {
-    int3 m;
-    int3 n;
-} Grid;
-
 static Grid
 createGrid(const AcMeshInfo& config)
 {
@@ -132,6 +127,7 @@ acInit(const AcMeshInfo& config)
    // Initialize the devices
    for (int i = 0; i < num_devices; ++i) {
        createDevice(i, subgrid_config, &devices[i]);
+        loadGlobalGrid(devices[i], grid);
        printDeviceInfo(devices[i]);
    }
    return AC_SUCCESS;
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -35,6 +35,7 @@ typedef struct {

 __constant__ AcMeshInfo d_mesh_info;
 __constant__ int3 d_multigpu_offset;
+__constant__ Grid globalGrid; // Filled per device by loadGlobalGrid() via cudaMemcpyToSymbol
 #define DCONST_INT(X)  (d_mesh_info.int_params[X])
 #define DCONST_REAL(X) (d_mesh_info.real_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
@@ -377,3 +378,12 @@ loadDeviceConstant(const Device device, const AcRealParam param, const AcReal va
                                          offset, cudaMemcpyHostToDevice));
    return AC_SUCCESS;
 }
+
+AcResult
+loadGlobalGrid(const Device device, const Grid grid) // Uploads `grid` into `globalGrid` on `device`
+{
+    ERRCHK_CUDA_ALWAYS(cudaSetDevice(device->id)); // Checked: a failed switch would copy to another GPU
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), // Whole struct, offset 0
+                                          0, cudaMemcpyHostToDevice));
+    return AC_SUCCESS; // Reached only when both CUDA calls passed their checks
+}
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -34,6 +34,11 @@ typedef enum {
  STREAM_ALL
 } StreamType;

+typedef struct { // Grid descriptor; copied to each device with loadGlobalGrid()
+    int3 m; // presumably the mesh extents including ghost zones (cf. AC_mx) - TODO confirm
+    int3 n; // presumably the extents of the computational domain only - TODO confirm
+} Grid;
+
 typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
                                 // in Vulkan, f.ex. VkDevice

@@ -92,3 +97,6 @@ AcResult loadDeviceConstant(const Device device, const AcIntParam param, const i

 /** */
 AcResult loadDeviceConstant(const Device device, const AcRealParam param, const AcReal value);
+
+/** Copies `grid` into the `globalGrid` device constant of `device`; returns AC_SUCCESS. */
+AcResult loadGlobalGrid(const Device device, const Grid grid);
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -727,6 +727,9 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
        const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
                                                            threadIdx.y + blockIdx.y * blockDim.y + start.y,\
                                                            threadIdx.z + blockIdx.z * blockDim.z + start.z};\
+        const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
+                                            d_multigpu_offset.y + vertexIdx.y, \
+                                            d_multigpu_offset.z + vertexIdx.z}; \
        if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
            return;\
 \