initial commit

2019-09-19 10:59:28 -05:00
parent a8a014e706
commit f51ef904fb
18 changed files with 572 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,45 @@
+# 3.8+ for project(LANGUAGES CUDA)
+# 3.9+ for OpenMP::OpenMP_CXX
+# 3.10+ FindOpenMP gained support for language-specific components
+# 3.11+ for CMake not to add -fopenmp to the nvcc flags
+cmake_minimum_required(VERSION 3.11 FATAL_ERROR)
+
+project(perfect LANGUAGES CUDA CXX VERSION 0.1.0)
+
+# https://blog.kitware.com/cmake-and-the-default-build-type/
+# Default build type if none was specified: Debug in a git checkout, else Release
+set(default_build_type "Release")
+if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
+  set(default_build_type "Debug")
+endif()
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
+  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") # reported after the default is applied
+
+# CMAKE_CUDA_FLAGS is a space-separated string: append to it, never build a ;-list
+if (CMAKE_BUILD_TYPE MATCHES Debug)
+  message(STATUS "Setting verbose build during Debug")
+  set(CMAKE_VERBOSE_MAKEFILE ON)
+  string(APPEND CMAKE_CUDA_FLAGS " -G")
+elseif (CMAKE_BUILD_TYPE MATCHES Release)
+  add_definitions(-DNDEBUG)
+  string(APPEND CMAKE_CUDA_FLAGS " -lineinfo")
+endif()
+
+
+
+set(CMAKE_CUDA_STANDARD 11) # default CUDA standard for targets created after this line
+# include/perfect/CMakeLists.txt exports the PERFECT_HEADERS variable into this scope
+add_subdirectory(include/perfect)
+# perfect is an INTERFACE target: it builds nothing and only carries the include path
+add_library(perfect INTERFACE)
+target_include_directories(perfect INTERFACE include/)
+
+# the example executables link against the perfect target defined above
+add_subdirectory(examples)
--- a/README.md
+++ b/README.md
@@ -1,2 +1,19 @@
 # perfect
+
 CPU/GPU performance control library for benchmarking
+* x86
+* POWER
+* Nvidia
+
+## Features
+
+- [x] Disable CPU turbo (linux)
+- [x] Set OS CPU performance mode to maximum (linux)
+- [x] Set GPU clocks (nvidia)
+- [x] Disable GPU turbo (nvidia)
+- [x] Flush addresses from cache (amd64, POWER)
+
+## Wish List
+
+- [ ] Nvidia GPU power monitoring
+- [ ] Nvidia GPU utilization monitoring
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Host-compiler warnings for CUDA sources, forwarded through nvcc with -Xcompiler.
+# -Wredundant-decls is left out for CUDA 10.1 and -Wundef for CUDA 10.0.
+string(APPEND CMAKE_CUDA_FLAGS " \
+ -Xcompiler=-Wall\
+ -Xcompiler=-Wextra\
+ -Xcompiler=-Wcast-qual \
+ -Xcompiler=-Wcast-align \
+ -Xcompiler=-Wstrict-aliasing \
+ -Xcompiler=-Wpointer-arith \
+ -Xcompiler=-Winit-self \
+ -Xcompiler=-Wshadow \
+ -Xcompiler=-Wswitch-enum \
+ -Xcompiler=-Wfloat-equal \
+ -Xcompiler=-Wvla\
+ -Xcompiler=-fmax-errors=1 \
+ -Xcompiler=-Wfatal-errors\
+ ")
+# The same warning set for C++ sources, plus the two flags left out for CUDA.
+string(APPEND CMAKE_CXX_FLAGS " \
+ -Wredundant-decls \
+ -Wundef \
+ -Wall\
+ -Wextra\
+ -Wcast-qual \
+ -Wcast-align \
+ -Wstrict-aliasing \
+ -Wpointer-arith \
+ -Winit-self \
+ -Wshadow \
+ -Wswitch-enum \
+ -Wfloat-equal \
+ -Wvla\
+ -fmax-errors=1 \
+ -Wfatal-errors\
+ ")
+
+# One executable per example source. `perfect` is linked PRIVATE: an executable
+# has no consumers, and the keyword-less signature has legacy semantics.
+add_executable(cpu-cache cpu_cache.cpp)
+target_link_libraries(cpu-cache PRIVATE perfect)
+add_executable(cpu-turbo cpu_turbo.cpp)
+target_link_libraries(cpu-turbo PRIVATE perfect)
+add_executable(os-perf os_perf.cpp)
+target_link_libraries(os-perf PRIVATE perfect)
+
+# GPU examples are CUDA sources (.cu)
+add_executable(gpu-clocks gpu_clocks.cu)
+target_link_libraries(gpu-clocks PRIVATE perfect)
+add_executable(gpu-turbo gpu_turbo.cu)
+target_link_libraries(gpu-turbo PRIVATE perfect)
--- a/examples/cpu_cache.cpp
+++ b/examples/cpu_cache.cpp
@@ -0,0 +1,11 @@
+#include "perfect/cpu_cache.hpp"
+
+// Allocate a buffer, flush it from the CPU caches, then release it.
+int main() {
+    int *const buffer = new int[1024];
+    flush_all(buffer, sizeof(int) * 1024);
+    // `buffer` has now been flushed from cache into main memory, and every load
+    // and store issued before flush_all() is complete; benchmark code goes here
+    delete[] buffer;
+    return 0;
+}
--- a/examples/cpu_turbo.cpp
+++ b/examples/cpu_turbo.cpp
@@ -0,0 +1,14 @@
+#include "perfect/cpu_turbo.hpp"
+
+// Run a region with CPU turbo disabled, then restore the previous setting.
+int main() {
+    // remember the turbo setting found on entry
+    perfect::CpuTurboState saved;
+    perfect::get_cpu_turbo_state(&saved);
+
+    perfect::disable_cpu_turbo();
+    // ... code that should run with CPU turbo disabled goes here ...
+
+    perfect::set_cpu_turbo_state(saved); // restore
+    return 0;
+}
--- a/examples/gpu_clocks.cu
+++ b/examples/gpu_clocks.cu
@@ -0,0 +1,5 @@
+#include "perfect/gpu_clocks.hpp"
+
+// Placeholder example: it only verifies that perfect/gpu_clocks.hpp can be
+// included from a CUDA translation unit; there is nothing to run yet.
+int main() { return 0; }
--- a/examples/gpu_turbo.cu
+++ b/examples/gpu_turbo.cu
@@ -0,0 +1,5 @@
+#include "perfect/gpu_turbo.hpp"
+
+// Placeholder example: it only verifies that perfect/gpu_turbo.hpp can be
+// included from a CUDA translation unit; there is nothing to run yet.
+int main() { return 0; }
--- a/examples/os_perf.cpp
+++ b/examples/os_perf.cpp
@@ -0,0 +1,26 @@
+#include "perfect/os_perf.hpp"
+
+#include <map>
+
+// Put every CPU this process may run on into the OS maximum-performance mode,
+// then restore what each CPU was using before.
+int main() {
+    // state of each CPU on entry, keyed by CPU id
+    std::map<int, perfect::OsPerfState> saved;
+
+    // record the current state before overriding it
+    for (const int cpu : perfect::cpus()) {
+        perfect::OsPerfState &slot = saved[cpu];
+        perfect::get_os_perf_state(&slot, cpu);
+        perfect::os_perf_state_maximum(cpu);
+    }
+
+    // ... code that needs every CPU in the maximum performance mode goes here ...
+
+    // put every CPU back the way it was found
+    for (const auto &entry : saved) {
+        perfect::set_os_perf_state(entry.first, entry.second);
+    }
+
+    return 0;
+}
--- a/include/perfect/CMakeLists.txt
+++ b/include/perfect/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Header list for the parent directory. Build it locally and export it once:
+# set(... PARENT_SCOPE) never updates this scope, so chained exports keep only the last.
+list(APPEND PERFECT_HEADERS cpu_cache.hpp cpu_turbo.hpp os_perf.hpp result.hpp)
+set(PERFECT_HEADERS ${PERFECT_HEADERS} PARENT_SCOPE)
--- a/include/perfect/cpu_cache.hpp
+++ b/include/perfect/cpu_cache.hpp
@@ -0,0 +1,126 @@
+#pragma once
+
+/*!
+Routines for controlling CPU caching
+*/
+
+#pragma once
+
+#include <algorithm>
+#include <cstdio>
+#include <iostream>
+
+#ifdef __linux__
+#include <unistd.h>
+#endif
+
+// https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#Simple-Constraints
+
+/*! Flush the cache line containing the byte at `p`.
+
+Uses `dcbf` on POWER and `clflush` on amd64. Ordering with respect to other
+memory accesses is left to the caller (flush_all wraps the flushes in barriers).
+*/
+inline void flush_line(void *p) {
+#ifdef __powerpc__
+
+  /*
+  PowerISA_V2.07B p. 773
+  dcbf RA,RB,L
+
+  effective address is RA|0 + RB
+  this mnemonic has L=0, which is through all cache levels
+  write block to storage and mark as invalid in all processors
+
+  see also linux/arch/powerpc/include/asm/cache.h
+  */
+  asm volatile("dcbf 0, %0"
+               : // no outputs
+               : "r"(p)
+               : "memory");
+
+#elif __amd64__
+
+  /*
+  clflush mem8 (p139 of https://www.amd.com/system/files/TechDocs/24594.pdf)
+  see also arch/x86/include/asm/special_insns.h
+
+  The operand must be the byte *pointed to*: passing `p` itself as the "m"
+  operand would flush the line holding the local pointer variable instead.
+  */
+  asm volatile("clflush %0"
+               : "+m"(*static_cast<volatile char *>(p))
+               : // no inputs
+               : // no clobbers
+  );
+
+#else
+#error "unsupported platform"
+  (void)p;
+#endif
+}
+
+/*! Execute a memory fence instruction: `sync` on POWER, `mfence` on amd64.
+
+Each asm statement also carries a "memory" clobber and takes no inputs or
+outputs.
+*/
+inline void barrier_all() {
+#if defined(__powerpc__)
+
+  // plain `sync` is the mnemonic for `sync 0`, the heavyweight sync
+  asm volatile("sync" : /* no outputs */ : /* no inputs */ : "memory");
+
+#elif defined(__amd64__)
+
+  asm volatile("mfence" : /* no outputs */ : /* no inputs */ : "memory");
+
+#else
+
+#error "unsupported platform"
+
+#endif
+}
+
+/*! Return the smallest cache line size (in bytes) reported for cache levels 1-4.
+
+Levels for which sysconf() does not report a positive size are ignored.
+Return 16 if no level reports a usable size.
+*/
+inline size_t cache_linesize() {
+#ifdef __linux__
+  // sysconf() names for the L1 data cache and the L2-L4 cache line sizes
+  const int levels[] = {_SC_LEVEL1_DCACHE_LINESIZE, _SC_LEVEL2_CACHE_LINESIZE,
+                        _SC_LEVEL3_CACHE_LINESIZE, _SC_LEVEL4_CACHE_LINESIZE};
+
+  long smallest = 0; // 0 == nothing detected yet
+  for (const int level : levels) {
+    const long size = sysconf(level);
+    if (size > 0 && (smallest == 0 || size < smallest)) {
+      smallest = size;
+    }
+  }
+
+  // an error (-1) or unknown (0) result must never become the returned stride
+  return smallest > 0 ? static_cast<size_t>(smallest) : 16;
+#else
+#error "unsupported platform"
+#endif
+}
+
+/*! Flush every cache line that overlaps the `n` bytes starting at `p`.
+`p` does not have to be cache-line aligned. */
+inline void flush_all(void *p, const size_t n) {
+  char *const bytes = static_cast<char *>(p);
+  const size_t lineSize = cache_linesize();
+
+  barrier_all(); // cache flush may not be ordered wrt other kinds of accesses
+  for (size_t i = 0; i < n; i += lineSize) {
+    flush_line(bytes + i);
+  }
+  if (n > 0) {
+    flush_line(bytes + (n - 1)); // unaligned `p`: the tail may reach one more line
+  }
+  barrier_all(); // make flushing visible to other accesses
+}
+
--- a/include/perfect/cpu_turbo.hpp
+++ b/include/perfect/cpu_turbo.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#ifdef __linux__ // linux
+#include "detail/os/linux.hpp"
+
+#ifdef __amd64__
+#include "detail/turbo/linux_amd64.hpp"
+#elif __powerpc64__
+#include "detail/turbo/linux_power.hpp"
+#else
+#error "unsupported CPU arch"
+#endif
+
+#else // not linux
+#error "unsupported OS"
+#endif
+
+
+
+#include "result.hpp"
+
+namespace perfect {
+/*! Snapshot of the CPU turbo setting, used to restore it later. */
+struct CpuTurboState {
+    bool enabled; //!< value of is_turbo_enabled() when the state was captured
+};
+
+/*! Store the current CPU turbo setting in `state`, which must not be null.
+    Always returns Result::SUCCESS. */
+inline Result get_cpu_turbo_state(CpuTurboState *state) {
+    state->enabled = is_turbo_enabled();
+    return Result::SUCCESS;
+}
+
+/*! Re-apply a setting captured by get_cpu_turbo_state().
+    Returns the Result of the underlying enable/disable call. */
+inline Result set_cpu_turbo_state(CpuTurboState state) {
+    return state.enabled ? enable_cpu_turbo() : disable_cpu_turbo();
+}
+
+} // namespace perfect
--- a/include/perfect/detail/os/linux.hpp
+++ b/include/perfect/detail/os/linux.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <string>
+#include <fstream>
+#include <cassert>
+#include <vector>
+
+#include <sched.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "perfect/result.hpp"
+
+namespace perfect {
+/*! Return the ids of the CPUs in the calling thread's affinity mask.
+    Asserts if the mask cannot be queried; with NDEBUG the result is then empty. */
+inline std::vector<int> cpus() {
+    std::vector<int> result;
+    cpu_set_t mask;
+    if (sched_getaffinity(0 /*caller*/, sizeof(cpu_set_t), &mask)) {
+        assert(0 && "failed sched_getaffinity");
+        return result; // never read the unset mask
+    }
+    for (int i = 0; i < CPU_SETSIZE; ++i) {
+        if (CPU_ISSET(i, &mask)) {
+            result.push_back(i);
+        }
+    }
+    return result;
+}
+
+/*! Path of the sysfs file that holds the cpufreq scaling governor of `cpu`. */
+inline std::string governor_path(const int cpu) {
+    return "/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/cpufreq/scaling_governor";
+}
+
+/*! Read the scaling governor of `cpu` into `result`; Result::UNKNOWN if it cannot be read. */
+inline Result get_governor(std::string &result, const int cpu) {
+    std::ifstream ifs(governor_path(cpu), std::ifstream::in);
+    return std::getline(ifs, result) ? Result::SUCCESS : Result::UNKNOWN;
+}
+
+/*! Write `governor` as the scaling governor of `cpu`.
+    Returns Result::NO_PERMISSION if the write fails. */
+inline Result set_governor(const int cpu, const std::string &governor) {
+    std::ofstream ofs(governor_path(cpu), std::ofstream::out);
+    ofs << governor;
+    ofs.close();
+    return ofs.fail() ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+} // namespace perfect
--- a/include/perfect/detail/turbo/linux_amd64.hpp
+++ b/include/perfect/detail/turbo/linux_amd64.hpp
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <cassert>
+#include <fstream>
+
+#include "perfect/result.hpp"
+
+namespace perfect {
+/*! True if the intel_pstate `no_turbo` sysfs control can be opened for reading. */
+inline bool has_intel_pstate_no_turbo() {
+  return bool(std::ifstream("/sys/devices/system/cpu/intel_pstate/no_turbo"));
+}
+
+/*! Write `s` to the intel_pstate `no_turbo` control.
+    Returns 0 on success and 1 if the write failed. */
+inline int write_intel_pstate_no_turbo(const std::string &s) {
+  assert(has_intel_pstate_no_turbo());
+  std::string path("/sys/devices/system/cpu/intel_pstate/no_turbo");
+  std::ofstream ofs(path, std::ofstream::out);
+  ofs << s;
+  ofs.close();
+  return ofs.fail() ? 1 : 0;
+}
+
+/*! Return the first line of the intel_pstate `no_turbo` control. */
+inline std::string read_intel_pstate_no_turbo() {
+  assert(has_intel_pstate_no_turbo());
+  std::string path("/sys/devices/system/cpu/intel_pstate/no_turbo");
+  std::ifstream ifs(path, std::ifstream::in);
+  std::string result;
+  std::getline(ifs, result);
+  return result;
+}
+
+/*! Turbo counts as enabled exactly when `no_turbo` reads "0". */
+inline bool is_turbo_enabled() {
+  return "0" == read_intel_pstate_no_turbo();
+}
+
+/*! Disable turbo by writing "1" to `no_turbo`; NO_PERMISSION if the write fails. */
+inline Result disable_cpu_turbo() {
+  return write_intel_pstate_no_turbo("1") ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+/*! Enable turbo by writing "0" to `no_turbo`; NO_PERMISSION if the write fails. */
+inline Result enable_cpu_turbo() {
+  return write_intel_pstate_no_turbo("0") ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+} // namespace perfect
--- a/include/perfect/detail/turbo/linux_power.hpp
+++ b/include/perfect/detail/turbo/linux_power.hpp
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "perfect/result.hpp"
+
+namespace perfect {
+/*! True if the cpufreq `boost` sysfs control can be opened for reading. */
+inline bool has_acpi_cpufreq_boost() {
+  return bool(std::ifstream("/sys/devices/system/cpu/cpufreq/boost"));
+}
+
+/*! Write `s` to the cpufreq `boost` control.
+    Returns 0 on success and 1 if the write failed. */
+inline int write_acpi_cpufreq_boost(const std::string &s) {
+  assert(has_acpi_cpufreq_boost());
+  std::string path("/sys/devices/system/cpu/cpufreq/boost");
+  std::ofstream ofs(path, std::ofstream::out);
+  ofs << s;
+  ofs.close();
+  return ofs.fail() ? 1 : 0;
+}
+
+/*! Return the first line of the cpufreq `boost` control (name spelling kept for callers). */
+inline std::string read_acpi_cpufeq_boost() {
+  assert(has_acpi_cpufreq_boost());
+  std::string path("/sys/devices/system/cpu/cpufreq/boost");
+  std::ifstream ifs(path, std::ifstream::in);
+  std::string result;
+  std::getline(ifs, result);
+  return result;
+}
+
+/*! Turbo counts as enabled exactly when `boost` reads "1". */
+inline bool is_turbo_enabled() {
+  return "1" == read_acpi_cpufeq_boost();
+}
+
+/*! Disable turbo by writing "0" to `boost`; NO_PERMISSION if the write fails. */
+inline Result disable_cpu_turbo() {
+  return write_acpi_cpufreq_boost("0") ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+/*! Enable turbo by writing "1" to `boost`; NO_PERMISSION if the write fails. */
+inline Result enable_cpu_turbo() {
+  return write_acpi_cpufreq_boost("1") ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+} // namespace perfect
--- a/include/perfect/gpu_clocks.hpp
+++ b/include/perfect/gpu_clocks.hpp
@@ -0,0 +1 @@
+#pragma once
--- a/include/perfect/gpu_turbo.hpp
+++ b/include/perfect/gpu_turbo.hpp
@@ -0,0 +1 @@
+#pragma once
--- a/include/perfect/os_perf.hpp
+++ b/include/perfect/os_perf.hpp
@@ -0,0 +1,55 @@
+#pragma once
+
+
+
+#include <vector>
+#include <string>
+#include <cassert>
+
+
+
+#ifdef __linux__
+#include "detail/os/linux.hpp"
+#else
+#error "unsupported platform"
+#endif
+
+#include "result.hpp"
+
+namespace perfect {
+struct OsPerfState {
+#ifdef __linux__
+    std::string governor; //!< cpufreq scaling governor captured for one CPU
+#else
+#error "unsupported platform"
+#endif
+};
+
+/*! Store the current OS performance state of `cpu` in `state` (must not be null). */
+inline Result get_os_perf_state(OsPerfState *state, const int cpu) {
+    assert(state);
+#ifdef __linux__
+    return get_governor(state->governor, cpu);
+#else
+#error "unsupported platform"
+#endif
+}
+
+/*! Put `cpu` in the maximum performance mode (Linux: the "performance" governor). */
+inline Result os_perf_state_maximum(const int cpu) {
+#ifdef __linux__
+    return set_governor(cpu, "performance");
+#else
+#error "unsupported platform"
+#endif
+}
+
+/*! Apply `state`, typically one captured earlier by get_os_perf_state(), to `cpu`. */
+inline Result set_os_perf_state(const int cpu, OsPerfState state) {
+#ifdef __linux__
+    return set_governor(cpu, state.governor);
+#else
+#error "unsupported platform"
+#endif
+}
+} // namespace perfect
--- a/include/perfect/result.hpp
+++ b/include/perfect/result.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <cassert>
+
+namespace perfect {
+enum class Result {
+    SUCCESS,       //!< described as "success"
+    NVIDIA_ML,     //!< described as "nvidia-ml error"
+    NO_PERMISSION, //!< described as "no permission"
+    UNKNOWN        //!< described as "unknown error"
+};
+/*! Return a string literal describing `result`; never returns null. */
+inline const char *get_string(const Result &result) {
+    switch (result) {
+        case Result::SUCCESS: return "success";
+        case Result::NO_PERMISSION: return "no permission";
+        case Result::UNKNOWN: return "unknown error";
+        case Result::NVIDIA_ML: return "nvidia-ml error";
+    }
+    assert(0 && "unexpected perfect::Result");
+    return "unexpected perfect::Result"; // reached only when asserts are compiled out
+}
+} // namespace perfect