Added some preliminary OpenMP pragmas and verified that acIntegrate works as it should.

jpekkila
2019-08-07 19:08:52 +03:00
parent c2bd5ae3e6
commit 1525e0603f
4 changed files with 35 additions and 6 deletions
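
Note: every OpenMP pragma added in this commit is still commented out. Below is a minimal sketch of what enabling one of them would do, assuming the per-device loop bodies in node.cc are independent; per_device_work() is a hypothetical stand-in for the acDevice* calls and is not part of the commit.

/* Sketch only; builds with: gcc -fopenmp sketch.c */
#include <omp.h>
#include <stdio.h>

static void per_device_work(int device_id)
{
    /* Stand-in for e.g. acDeviceSynchronizeStream(devices[i], stream). */
    printf("host thread %d handling device %d\n", omp_get_thread_num(), device_id);
}

int main(void)
{
    const int num_devices = 4; /* placeholder for node->num_devices */
    /* With the pragma uncommented, each iteration runs on its own host thread. */
    #pragma omp parallel for
    for (int i = 0; i < num_devices; ++i)
        per_device_work(i);
    return 0;
}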

View File

@@ -4,6 +4,7 @@
## Find packages
find_package(CUDA REQUIRED)
# find_package(OpenMP REQUIRED)
## Architecture and optimization flags
set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
@@ -12,7 +13,8 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
-gencode arch=compute_61,code=sm_61
-lineinfo
-ftz=true # Flush denormalized floats to zero
-std=c++11)
-std=c++11
)# --compiler-options ${OpenMP_CXX_FLAGS})
#--maxrregcount=255
# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
# =cg to opt out

View File

@@ -73,8 +73,11 @@ acStore(AcMesh* host_mesh)
AcResult
acIntegrate(const AcReal dt)
{
/*
acNodeIntegrate(nodes[0], dt);
return acBoundcondStep();
*/
return acNodeIntegrate(nodes[0], dt);
}
AcResult
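
Note: acIntegrate() now returns acNodeIntegrate() directly instead of also calling acBoundcondStep(). A hedged sketch of the caller-side pattern this implies, mirroring the check_rk3 hunk in the last file of this commit (num_steps is a placeholder):

/* Sketch only: boundary conditions are now the caller's responsibility. */
for (int step = 0; step < num_steps; ++step)
    acIntegrate(dt);      /* runs the RK3 substeps on the node */
acBoundcondStep();        /* apply boundary conditions explicitly afterwards */
acStore(gpu_mesh);        /* copy the device mesh back to the host for checks */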

View File

@@ -162,6 +162,7 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
#endif
// Initialize the devices
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder
const int3 multigpu_offset = (int3){0, 0, i * node->subgrid.n.z};
@@ -173,6 +174,7 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
}
// Enable peer access
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int front = (i + 1) % node->num_devices;
const int back = (i - 1 + node->num_devices) % node->num_devices;
@@ -205,6 +207,7 @@ acNodeDestroy(Node node)
{
acNodeSynchronizeStream(node, STREAM_ALL);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceDestroy(node->devices[i]);
}
@@ -241,6 +244,7 @@ acNodeAutoOptimize(const Node node)
AcResult
acNodeSynchronizeStream(const Node node, const Stream stream)
{
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceSynchronizeStream(node->devices[i], stream);
}
@@ -267,6 +271,7 @@ acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
// #pragma omp parallel for
for (int i = 0; i < node->num_devices - 1; ++i) {
// ...|ooooxxx|... -> xxx|ooooooo|...
const int3 src = (int3){0, 0, node->subgrid.n.z};
@@ -278,6 +283,7 @@ acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
// #pragma omp parallel for
for (int i = 1; i < node->num_devices; ++i) {
// ...|ooooooo|xxx <- ...|xxxoooo|...
const int3 src = (int3){0, 0, NGHOST};
@@ -305,6 +311,7 @@ acNodeSynchronizeMesh(const Node node, const Stream stream)
AcResult
acNodeSwapBuffers(const Node node)
{
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceSwapBuffers(node->devices[i]);
}
@@ -316,6 +323,7 @@ acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param
const AcReal value)
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceLoadConstant(node->devices[i], stream, param, value);
}
@@ -329,6 +337,7 @@ acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcM
{
acNodeSynchronizeStream(node, stream);
// See the beginning of the file for an explanation of the index mapping
// // #pragma omp parallel for
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
@@ -404,6 +413,7 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
@@ -466,6 +476,7 @@ acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep,
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};
@@ -490,6 +501,7 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand
if (node->num_devices > 1) {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
@@ -543,8 +555,8 @@ acNodeIntegrate(const Node node, const AcReal dt)
// xxx|OOO OOOOOOOOO OOO|xxx
// ^ ^ ^ ^
// n0 n1 n2 n3
const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
// const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
// const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
// const int3 n2 = node->grid.n;
// const int3 n3 = n0 + node->grid.n;
@@ -554,12 +566,15 @@ acNodeIntegrate(const Node node, const AcReal dt)
local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
}
acNodeSynchronizeStream(node, STREAM_ALL);
// Inner inner
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 m1 = n1;
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = node->subgrid.n;
acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
@@ -568,32 +583,38 @@ acNodeIntegrate(const Node node, const AcReal dt)
acNodeSynchronizeStream(node, (Stream)vtxbuf);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Back
const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Top
const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Right
const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
@@ -663,6 +684,7 @@ acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype
acNodeSynchronizeStream(node, STREAM_ALL);
AcReal results[node->num_devices];
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]);
}
@@ -679,6 +701,7 @@ acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype,
acNodeSynchronizeStream(node, STREAM_ALL);
AcReal results[node->num_devices];
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]);
}

View File

@@ -29,13 +29,13 @@
#include <stdio.h>
#include "config_loader.h"
#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_boundconds.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
#include "src/core/math_utils.h"
#include "src/core/errchk.h"
@@ -431,8 +431,9 @@ check_rk3(const AcMeshInfo& mesh_info)
acIntegrate(dt);
model_rk3(dt, model_mesh);
boundconds(model_mesh->info, model_mesh);
}
boundconds(model_mesh->info, model_mesh);
acBoundcondStep();
acStore(gpu_mesh);
bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh);