From f3de2fa03ccb18364995606ec56498f89c5ec840 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 5 Aug 2019 15:03:02 +0300
Subject: [PATCH 1/3] Made globalVertexIdx available during preprocessing.
 NOTE: potentially dangerous. globalVertexIdx should never be used for reading
 data from the vertex buffers.

---
 acc/src/code_generator.c     | 13 ++++++++-----
 src/core/kernels/kernels.cuh |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 2810f5b..93ca42f 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -335,7 +335,8 @@ traverse(const ASTNode* node)
     // Preprocessed parameter boilerplate
     if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED)
         inside_preprocessed = true;
-    static const char preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
+    static const char
+        preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, const int3 globalVertexIdx, ";
     if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
         printf("%s ", preprocessed_parameter_boilerplate);
     // BOILERPLATE END////////////////////////////////////////////////////////
@@ -491,6 +492,7 @@ generate_preprocessed_structures(void)
     // FILLING THE DATA STRUCT
     printf("static __device__ __forceinline__ AcRealData\
             read_data(const int3 vertexIdx,\
+                const int3 globalVertexIdx,\
             AcReal* __restrict__ buf[], const int handle)\
             {\n\
                 %sData data;\n",
@@ -498,7 +500,7 @@ generate_preprocessed_structures(void)
 
     for (int i = 0; i < num_symbols; ++i) {
         if (symbol_table[i].type_qualifier == PREPROCESSED)
-            printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n",
+            printf("data.%s = preprocessed_%s(vertexIdx, globalVertexIdx, buf[handle]);\n",
                    symbol_table[i].identifier, symbol_table[i].identifier);
     }
     printf("return data;\n");
@@ -526,13 +528,14 @@ generate_preprocessed_structures(void)
         \
         static __device__ __forceinline__ AcReal3Data\
         read_data(const int3 vertexIdx,\
+                  const int3 globalVertexIdx,\
                   AcReal* __restrict__ buf[], const int3& handle)\
         {\
             AcReal3Data data;\
         \
-            data.x = read_data(vertexIdx, buf, handle.x);\
-            data.y = read_data(vertexIdx, buf, handle.y);\
-            data.z = read_data(vertexIdx, buf, handle.z);\
+            data.x = read_data(vertexIdx, globalVertexIdx, buf, handle.x);\
+            data.y = read_data(vertexIdx, globalVertexIdx, buf, handle.y);\
+            data.z = read_data(vertexIdx, globalVertexIdx, buf, handle.z);\
         \
             return data;\
         }\
diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh
index 52b48f6..c604205 100644
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -671,7 +671,7 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
 }
 
 #define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
-#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
+#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
 #define READ_OUT(handle) (read_out(idx, buffer.out, handle))
 
 // also write for clarity here also, not for the DSL

From 5f4246fb42802afa3b50175f4cb89c051e724fc4 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 6 Aug 2019 14:46:13 +0300
Subject: [PATCH 2/3] Standalone now uses O2 optimization level instead of O3.
 Also removed -march=native since this causes issues if the program is
 compiled on a different architecture than it is run on. Since we do not do
 heavy arithmetic on the host side and the host code is not a
 performance-critical part of the code, -march=native is not very useful
 anyway.

---
 src/core/CMakeLists.txt       | 3 +--
 src/standalone/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 5cbc271..b56c770 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -12,8 +12,7 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
                     -gencode arch=compute_61,code=sm_61
                     -lineinfo
                     -ftz=true # Flush denormalized floats to zero
-                    -std=c++11
-                    --compiler-options -march=native) # Native host machine code
+                    -std=c++11)
                     #--maxrregcount=255
                     # -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
                     # =cg to opt out
diff --git a/src/standalone/CMakeLists.txt b/src/standalone/CMakeLists.txt
index 6a03e32..ed9bda9 100644
--- a/src/standalone/CMakeLists.txt
+++ b/src/standalone/CMakeLists.txt
@@ -21,7 +21,7 @@ if (BUILD_RT_VISUALIZATION)
 endif ()
 
 ## Compilation flags
-add_compile_options(-march=native -pipe ${OpenMP_CXX_FLAGS})
+add_compile_options(-O2 -pipe ${OpenMP_CXX_FLAGS})
 add_compile_options(-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion)# -Wshadow)
 
 ## Compile and link

From e4b981fc6204d330439de92df1e27e764d67caf7 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 6 Aug 2019 14:59:41 +0300
Subject: [PATCH 3/3] Removed the O2 flag since cmake still defines the O3 flag
 in CMAKE_CXX_FLAGS_RELEASE and it's confusing which one gcc chooses if both
 O3 and O2 are passed during compilation. If the issue was the -march=native
 flag, then this should also work on Tiara.

---
 src/standalone/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/standalone/CMakeLists.txt b/src/standalone/CMakeLists.txt
index ed9bda9..ea1d04c 100644
--- a/src/standalone/CMakeLists.txt
+++ b/src/standalone/CMakeLists.txt
@@ -21,7 +21,7 @@ if (BUILD_RT_VISUALIZATION)
 endif ()
 
 ## Compilation flags
-add_compile_options(-O2 -pipe ${OpenMP_CXX_FLAGS})
+add_compile_options(-pipe ${OpenMP_CXX_FLAGS})
 add_compile_options(-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion)# -Wshadow)
 
 ## Compile and link