From 13e6c8e03daa044f64797ab09939603efd4d3954 Mon Sep 17 00:00:00 2001
From: Carl Pearson <pearson@illinois.edu>
Date: Thu, 19 Sep 2019 16:56:57 -0500
Subject: [PATCH] work on gpu clocks and gpu turbo example

---
 README.md                                   |   4 +
 examples/cpu_cache.cpp                      |   3 +
 examples/gpu_clocks.cu                      |  13 +++
 examples/gpu_turbo.cu                       |  13 +++
 include/perfect/cpu_cache.hpp               | 101 ++------------------
 include/perfect/detail/cache/amd64.hpp      |  35 +++++++
 include/perfect/detail/cache/power.hpp      |  35 +++++++
 include/perfect/detail/nvidia/nvidia-ml.hpp |  26 ++---
 include/perfect/detail/os/linux.hpp         |  26 +++++
 include/perfect/gpu_clocks.hpp              |  42 ++++----
 include/perfect/result.hpp                  |  14 +++
 tools/enable_turbo.cpp                      |  18 +---
 12 files changed, 194 insertions(+), 136 deletions(-)
 create mode 100644 include/perfect/detail/cache/amd64.hpp
 create mode 100644 include/perfect/detail/cache/power.hpp
diff --git a/README.md b/README.md
index 051f293..830d7a7 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,10 @@ CPU/GPU performance control library for benchmarking
 - [x] Disable GPU turbo (nvidia)
 - [x] Flush addresses from cache (amd64, POWER)
 
+## API
+
+See the programs in `examples/` for how each routine is called.
+
 ## Wish List
 
 - [ ] Nvidia GPU power monitoring
diff --git a/examples/cpu_cache.cpp b/examples/cpu_cache.cpp
index 550240e..79806c5 100644
--- a/examples/cpu_cache.cpp
+++ b/examples/cpu_cache.cpp
@@ -1,6 +1,9 @@
 #include "perfect/cpu_cache.hpp"
 
 int main(void) {
+
+    using namespace perfect;
+
     int *a = new int[1024];
     flush_all(a, 1024 * sizeof(int));
 
diff --git a/examples/gpu_clocks.cu b/examples/gpu_clocks.cu
index ae465f3..3d2d2ef 100644
--- a/examples/gpu_clocks.cu
+++ b/examples/gpu_clocks.cu
@@ -1,5 +1,18 @@
+#include <iostream>
+
 #include "perfect/gpu_clocks.hpp"
+#include "perfect/init.hpp"
 
 int main(void) {
 
+    using namespace perfect;
+
+    init();
+
+    for (unsigned int gpu = 0; gpu < 1; ++gpu) {
+      PERFECT(perfect::set_max_gpu_clocks(gpu));
+      PERFECT(perfect::reset_gpu_clocks(gpu));
+    }
+
+    return 0;
 }
\ No newline at end of file
diff --git a/examples/gpu_turbo.cu b/examples/gpu_turbo.cu
index 4170786..dfd7fc0 100644
--- a/examples/gpu_turbo.cu
+++ b/examples/gpu_turbo.cu
@@ -1,5 +1,18 @@
 #include "perfect/gpu_turbo.hpp"
+#include "perfect/init.hpp"
+
+// PERFECT(...) reports a failing call and exits; see perfect/result.hpp
 
 int main(void) {
 
+  using namespace perfect;
+  GpuTurboState state;
+
+  init();
+
+  for (unsigned int gpu = 0; gpu < 1; ++gpu) {
+    PERFECT(perfect::get_gpu_turbo_state(&state, gpu));
+    PERFECT(perfect::disable_gpu_turbo(gpu));
+    PERFECT(perfect::set_gpu_turbo_state(state, gpu));
+  }
 }
\ No newline at end of file
diff --git a/include/perfect/cpu_cache.hpp b/include/perfect/cpu_cache.hpp
index 68a8d77..b28e85c 100644
--- a/include/perfect/cpu_cache.hpp
+++ b/include/perfect/cpu_cache.hpp
@@ -11,116 +11,35 @@ Routines for controlling CPU caching
 #include <iostream>
 
 #ifdef __linux__
-#include <unistd.h>
-#endif
-
-// https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#Simple-Constraints
-
-inline void flush_line(void *p) {
-#ifdef __powerpc__
-
-  /*
-  PowerISA_V2.07B p. 773
-  dcbf RA,RB,L
-
-  effective address is RA|0 + RB
-  this mnemonic has L=0, which is through all cache levels
-  write block to storage and mark as invalid in all processors
-  */
-
-  /*!
-
-   linux/arch/powerpc/include/asm/cache.h
-  */
-  asm volatile("dcbf 0, %0"
-               : // no outputs
-               : "r"(p)
-               : "memory");
-
-#elif __amd64__
-
-  /*!
-
-  arch/x86/include/asm/special_insns.h
-
-   p139
-  https://www.amd.com/system/files/TechDocs/24594.pdf
-
-  clflush mem8
-  */
-
-  asm volatile("clflush %0"
-               : "+m"(p)
-               : // no inputs
-               : // no clobbers
-  );
+#include "detail/os/linux.hpp"
 #else
-#error "unsupported platform"
-  (void)p;
+#error "unsupported OS"
 #endif
-}
-
-inline void barrier_all() {
 
 #ifdef __powerpc__
-
-  // sync is a mnemonic for sync 0, heavyweight sync
-  asm volatile("sync"
-               : // no outputs
-               : // no inputs
-               : "memory");
-
+#include "detail/cache/power.hpp"
 #elif __amd64__
-
-  asm volatile("mfence"
-               : // no outputs
-               : // no inputs
-               : "memory");
-
+#include "detail/cache/amd64.hpp"
 #else
-#error "unsupported platform"
+#error "unsupported CPU arch"
 #endif
-}
 
-/*! return the smallest cache line size detected on the platform.
-Return 16 if the cache line size could not be detected.
-*/
-size_t cache_linesize() {
-#ifdef __linux__
-  long linesize, var;
-
-  var = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
-  linesize = var;
-
-  var = sysconf(_SC_LEVEL2_CACHE_LINESIZE);
-  linesize = var ? std::min(linesize, var) : linesize;
-
-  var = sysconf(_SC_LEVEL3_CACHE_LINESIZE);
-  linesize = var ? std::min(linesize, var) : linesize;
-
-  var = sysconf(_SC_LEVEL4_CACHE_LINESIZE);
-  linesize = var ? std::min(linesize, var) : linesize;
-
-  linesize = linesize ? linesize : 16;
-  return linesize;
-#else
-#error "unsupported platform"
-#endif
-}
+namespace perfect {
 
 inline void flush_all(void *p, const size_t n) {
 
   size_t lineSize = cache_linesize();
 
   // cache flush may not be ordered wrt other kinds of accesses
-  barrier_all();
+  detail::barrier_all();
 
   for (size_t i = 0; i < n; i += lineSize) {
     char *c = static_cast<char *>(p);
-    flush_line(&c[i]);
+    detail::flush_line(&c[i]);
   }
 
   // make flushing visible to other accesses
-  barrier_all();
+  detail::barrier_all();
 }
 
+}
\ No newline at end of file
diff --git a/include/perfect/detail/cache/amd64.hpp b/include/perfect/detail/cache/amd64.hpp
new file mode 100644
index 0000000..e0984aa
--- /dev/null
+++ b/include/perfect/detail/cache/amd64.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+namespace perfect {
+namespace detail {
+
+/*! flush the cache line that contains the byte at address `p`
+
+  clflush mem8
+
+  the operand has to be the pointed-to byte: using the pointer variable
+  itself as the "m" operand would flush the line holding that local
+  variable instead of the caller's data
+  see arch/x86/include/asm/special_insns.h in the linux kernel, and p139 of
+  https://www.amd.com/system/files/TechDocs/24594.pdf
+*/
+inline void flush_line(void *p) {
+  asm volatile("clflush %0"
+               : "+m"(*static_cast<volatile char *>(p))
+               : // no inputs
+               : // no clobbers
+  );
+}
+
+/*! issue an mfence; the "memory" clobber also keeps the compiler from
+  moving memory accesses across the asm statement
+*/
+inline void barrier_all() {
+  asm volatile("mfence"
+               : // no outputs
+               : // no inputs
+               : "memory");
+}
+
+} // namespace detail
+} // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/detail/cache/power.hpp b/include/perfect/detail/cache/power.hpp
new file mode 100644
index 0000000..3cec348
--- /dev/null
+++ b/include/perfect/detail/cache/power.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+// in perfect::detail, like amd64.hpp: cpu_cache.hpp calls detail::flush_line()
+namespace perfect {
+namespace detail {
+
+/*! flush the cache block containing address `p`
+
+  PowerISA_V2.07B p. 773
+  dcbf RA,RB,L
+
+  effective address is RA|0 + RB
+  this mnemonic has L=0, which is through all cache levels
+  write block to storage and mark as invalid in all processors
+
+  see also linux/arch/powerpc/include/asm/cache.h
+*/
+inline void flush_line(void *p) {
+  asm volatile("dcbf 0, %0"
+               : // no outputs
+               : "r"(p)
+               : "memory");
+}
+
+/*! heavyweight sync, with a "memory" clobber for the compiler */
+inline void barrier_all() {
+  // sync is a mnemonic for sync 0, heavyweight sync
+  asm volatile("sync"
+               : // no outputs
+               : // no inputs
+               : "memory");
+}
+
+} // namespace detail
+} // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/detail/nvidia/nvidia-ml.hpp b/include/perfect/detail/nvidia/nvidia-ml.hpp
index 1ac1dfb..40d2ca4 100644
--- a/include/perfect/detail/nvidia/nvidia-ml.hpp
+++ b/include/perfect/detail/nvidia/nvidia-ml.hpp
@@ -18,19 +18,23 @@ inline void checkNvml(nvmlReturn_t result, const char *file, const int line) {
 
 namespace perfect {
 namespace detail {
-std::vector<unsigned int> get_device_memory_clocks(unsigned int index) {
-  std::vector<unsigned int> result;
-  nvmlDevice_t device;
-  nvmlDeviceGetHandleByIndex(index, &device);
-  unsigned int resultCount = 0;
 
-  auto ret = nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, nullptr);
-  if (ret != NVML_ERROR_INSUFFICIENT_SIZE) {
-    NVML(ret);
+Result get_device_memory_clocks(std::vector<unsigned int> &memoryClocksMhz, unsigned int index) {
+  nvmlDevice_t device;
+  nvmlReturn_t ret;
+  ret = nvmlDeviceGetHandleByIndex(index, &device);
+  if (ret != NVML_SUCCESS) {
+    return from_nvml(ret);
   }
-  result.resize(resultCount);
-  NVML(nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, result.data()));
-  return result;
+  // the first call passes no buffer: it is only expected to report the count
+  unsigned int resultCount = 0;
+  ret = nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, nullptr);
+  if (ret != NVML_ERROR_INSUFFICIENT_SIZE) {
+    return from_nvml(ret);
+  }
+  memoryClocksMhz.resize(resultCount);
+  return from_nvml(nvmlDeviceGetSupportedMemoryClocks(device, &resultCount,
+                                                      memoryClocksMhz.data()));
 }
 
 Result get_device_graphics_clocks(std::vector<unsigned int> &graphicsClocksMhz,
diff --git a/include/perfect/detail/os/linux.hpp b/include/perfect/detail/os/linux.hpp
index 6568c70..8cbe4c7 100644
--- a/include/perfect/detail/os/linux.hpp
+++ b/include/perfect/detail/os/linux.hpp
@@ -51,4 +51,30 @@ Result set_governor(const int cpu, const std::string &governor) {
   return Result::SUCCESS;
 }
 
+/*! return the smallest cache line size detected on the platform.
+Return 16 if the cache line size could not be detected.
+
+sysconf() results that are not positive (level absent, or an error) are
+skipped, so they cannot win the minimum or wrap when converted to size_t.
+*/
+inline size_t cache_linesize() {
+#ifdef __linux__
+  const int names[] = {_SC_LEVEL1_DCACHE_LINESIZE, _SC_LEVEL2_CACHE_LINESIZE,
+                       _SC_LEVEL3_CACHE_LINESIZE, _SC_LEVEL4_CACHE_LINESIZE};
+
+  long linesize = 0; // 0: nothing detected yet
+  for (const int name : names) {
+    const long var = sysconf(name);
+    if (var > 0 && (linesize == 0 || var < linesize)) {
+      linesize = var;
+    }
+  }
+
+  // no level reported a usable size: fall back to 16 bytes
+  return linesize > 0 ? static_cast<size_t>(linesize) : 16;
+#else
+#error "unsupported platform"
+#endif
+}
+
 } // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/gpu_clocks.hpp b/include/perfect/gpu_clocks.hpp
index e719b8d..6cf4226 100644
--- a/include/perfect/gpu_clocks.hpp
+++ b/include/perfect/gpu_clocks.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <algorithm>
+
 #include "detail/nvidia/nvidia-ml.hpp"
 
 namespace perfect {
@@ -8,22 +10,28 @@ namespace perfect {
  */
 Result set_max_gpu_clocks(unsigned int idx) {
 
-  Result rt;
+  Result ret;
   std::vector<unsigned int> clksMhz;
+  nvmlDevice_t device;
 
-  ret = get_device_memory_clocks(clksMhz, idx);
-
-  auto maxMemMhz = *std::max_element(memClksMhz.begin(), memClksMhz.end());
-  ret = get_device_graphics_clocks(clksMhz, idx);
-  auto maxCoreMhz = *std::max_element(clksMhz.begin(), clksMhz.end());
-
-  auto ret = nvmlDeviceSetApplicationsClocks(device, maxMemMhz, maxCoreMhz);
-  if (ret == NVML_ERROR_NOT_SUPPORTED) {
-    return Result::NVML_NOT_SUPPORTED;
-  } else if (ret == NVML_ERROR_NO_PERMISSION) {
-    return Result::NVML_NO_PERMISSION;
+  ret = from_nvml(nvmlDeviceGetHandleByIndex(idx, &device));
+  if (ret != Result::SUCCESS) {
+    return ret;
   }
-  return Result::SUCCESS;
+
+  ret = detail::get_device_memory_clocks(clksMhz, idx);
+  // an empty list would make dereferencing max_element() undefined behavior
+  if (ret != Result::SUCCESS || clksMhz.empty()) {
+    return (ret != Result::SUCCESS) ? ret : Result::UNKNOWN;
+  }
+  auto maxMemMhz = *std::max_element(clksMhz.begin(), clksMhz.end());
+  ret = detail::get_device_graphics_clocks(clksMhz, idx, maxMemMhz);
+  if (ret != Result::SUCCESS || clksMhz.empty()) {
+    return (ret != Result::SUCCESS) ? ret : Result::UNKNOWN;
+  }
+  auto maxCoreMhz = *std::max_element(clksMhz.begin(), clksMhz.end());
+  return from_nvml(
+      nvmlDeviceSetApplicationsClocks(device, maxMemMhz, maxCoreMhz));
 }
 
 /*! Reset GPU clocks to default behavior
@@ -36,13 +44,7 @@ Result reset_gpu_clocks(unsigned int idx) {
   if (ret != NVML_SUCCESS) {
     assert(false);
   }
-  ret = nvmlDeviceResetApplicationsClocks(device);
-  if (ret == NVML_ERROR_NOT_SUPPORTED) {
-    return Result::NVML_NOT_SUPPORTED;
-  } else if (ret == NVML_ERROR_NO_PERMISSION) {
-    return Result::NVML_NO_PERMISSION;
-  }
-  return Result::SUCCESS;
+  return from_nvml(nvmlDeviceResetApplicationsClocks(device));
 }
 
 }; // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/result.hpp b/include/perfect/result.hpp
index 5cf3ba4..b0c6fce 100644
--- a/include/perfect/result.hpp
+++ b/include/perfect/result.hpp
@@ -14,6 +14,10 @@ enum class Result {
   UNKNOWN
 };
 
+
+
+
+
 Result from_nvml(nvmlReturn_t nvml) {
   switch (nvml) {
   case NVML_SUCCESS:
@@ -51,4 +55,14 @@ const char *get_string(const Result &result) {
   return "";
 }
 
+/*! print the error string for `result` with its file/line and exit(-1);
+    does nothing when `result` is Result::SUCCESS */
+inline void check(Result result, const char *file, const int line) {
+  if (result == Result::SUCCESS) { return; }
+  fprintf(stderr, "%s@%d: perfect Error: %s\n", file, line, get_string(result));
+  exit(-1);
+}
+
 } // namespace perfect
+// exit with file/line context unless `stmt` evaluates to Result::SUCCESS
+#define PERFECT(stmt) ::perfect::check((stmt), __FILE__, __LINE__)
diff --git a/tools/enable_turbo.cpp b/tools/enable_turbo.cpp
index 208a8e4..ca97206 100644
--- a/tools/enable_turbo.cpp
+++ b/tools/enable_turbo.cpp
@@ -12,24 +12,14 @@ int main(void) {
 
   perfect::init();
 
-  ret = get_cpu_turbo_state(&state);
-
-  if (ret != Result::SUCCESS) {
-    std::cerr << "ERROR: " << get_string(ret) << "\n";
-    exit(EXIT_FAILURE);
-  }
+  PERFECT(get_cpu_turbo_state(&state));
 
   if (is_turbo_enabled(state)) {
-    std::cerr << "turbo already enabled\n";
+    std::cerr << "cpu turbo already enabled\n";
     exit(EXIT_SUCCESS);
   } else {
-    ret = enable_cpu_turbo();
-    if (ret != Result::SUCCESS) {
-      std::cerr << "ERROR: " << get_string(ret) << "\n";
-      exit(EXIT_FAILURE);
-    } else {
-      std::cerr << "enabled turbo\n";
+    PERFECT(enable_cpu_turbo());
+    std::cerr << "enabled cpu turbo\n";
       exit(EXIT_SUCCESS);
-    }
   }
 }
\ No newline at end of file