From f3de2fa03ccb18364995606ec56498f89c5ec840 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 5 Aug 2019 15:03:02 +0300 Subject: [PATCH 1/3] Made globalVertexIdx available during preprocessing. NOTE: potentially dangerous. globalVertexIdx should never be used for reading data from the vertex buffers. --- acc/src/code_generator.c | 13 ++++++++----- src/core/kernels/kernels.cuh | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c index 2810f5b..93ca42f 100644 --- a/acc/src/code_generator.c +++ b/acc/src/code_generator.c @@ -335,7 +335,8 @@ traverse(const ASTNode* node) // Preprocessed parameter boilerplate if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED) inside_preprocessed = true; - static const char preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, "; + static const char + preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, const int3 globalVertexIdx, "; if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) printf("%s ", preprocessed_parameter_boilerplate); // BOILERPLATE END//////////////////////////////////////////////////////// @@ -491,6 +492,7 @@ generate_preprocessed_structures(void) // FILLING THE DATA STRUCT printf("static __device__ __forceinline__ AcRealData\ read_data(const int3 vertexIdx,\ + const int3 globalVertexIdx,\ AcReal* __restrict__ buf[], const int handle)\ {\n\ %sData data;\n", @@ -498,7 +500,7 @@ generate_preprocessed_structures(void) for (int i = 0; i < num_symbols; ++i) { if (symbol_table[i].type_qualifier == PREPROCESSED) - printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", + printf("data.%s = preprocessed_%s(vertexIdx, globalVertexIdx, buf[handle]);\n", symbol_table[i].identifier, symbol_table[i].identifier); } printf("return data;\n"); @@ -526,13 +528,14 @@ generate_preprocessed_structures(void) \ static __device__ __forceinline__ AcReal3Data\ read_data(const int3 vertexIdx,\ + const int3 
globalVertexIdx,\ AcReal* __restrict__ buf[], const int3& handle)\ {\ AcReal3Data data;\ \ - data.x = read_data(vertexIdx, buf, handle.x);\ - data.y = read_data(vertexIdx, buf, handle.y);\ - data.z = read_data(vertexIdx, buf, handle.z);\ + data.x = read_data(vertexIdx, globalVertexIdx, buf, handle.x);\ + data.y = read_data(vertexIdx, globalVertexIdx, buf, handle.y);\ + data.z = read_data(vertexIdx, globalVertexIdx, buf, handle.z);\ \ return data;\ }\ diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh index 52b48f6..c604205 100644 --- a/src/core/kernels/kernels.cuh +++ b/src/core/kernels/kernels.cuh @@ -671,7 +671,7 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle) } #define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value)) -#define READ(handle) (read_data(vertexIdx, buffer.in, handle)) +#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle)) #define READ_OUT(handle) (read_out(idx, buffer.out, handle)) // also write for clarity here also, not for the DSL From 5f4246fb42802afa3b50175f4cb89c051e724fc4 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 6 Aug 2019 14:46:13 +0300 Subject: [PATCH 2/3] Standalone now uses O2 optimization level instead of O3. Also removed -march=native since this causes issues if the program is compiled on a different architecture than it is run on. 
Since we do not do heavy arithmetic on the host side and the host code is not a performance-critical part of the code, -march=native is not very useful anyway --- src/core/CMakeLists.txt | 3 +-- src/standalone/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 5cbc271..b56c770 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -12,8 +12,7 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37 -gencode arch=compute_61,code=sm_61 -lineinfo -ftz=true # Flush denormalized floats to zero - -std=c++11 - --compiler-options -march=native) # Native host machine code + -std=c++11) #--maxrregcount=255 # -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache # =cg to opt out diff --git a/src/standalone/CMakeLists.txt b/src/standalone/CMakeLists.txt index 6a03e32..ed9bda9 100644 --- a/src/standalone/CMakeLists.txt +++ b/src/standalone/CMakeLists.txt @@ -21,7 +21,7 @@ if (BUILD_RT_VISUALIZATION) endif () ## Compilation flags -add_compile_options(-march=native -pipe ${OpenMP_CXX_FLAGS}) +add_compile_options(-O2 -pipe ${OpenMP_CXX_FLAGS}) add_compile_options(-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion)# -Wshadow) ## Compile and link From e4b981fc6204d330439de92df1e27e764d67caf7 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 6 Aug 2019 14:59:41 +0300 Subject: [PATCH 3/3] Removed the O2 flag since cmake still defines the O3 flag in CMAKE_CXX_FLAGS_RELEASE and it's confusing which one gcc chooses if both O3 and O2 are passed during compilation.
If the issue was the march=native flag then this should also work on Tiara --- src/standalone/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/standalone/CMakeLists.txt b/src/standalone/CMakeLists.txt index ed9bda9..ea1d04c 100644 --- a/src/standalone/CMakeLists.txt +++ b/src/standalone/CMakeLists.txt @@ -21,7 +21,7 @@ if (BUILD_RT_VISUALIZATION) endif () ## Compilation flags -add_compile_options(-O2 -pipe ${OpenMP_CXX_FLAGS}) +add_compile_options(-pipe ${OpenMP_CXX_FLAGS}) add_compile_options(-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion)# -Wshadow) ## Compile and link