From f51ef904fbc6a9810c7c3de2e0beaf20a0883a7a Mon Sep 17 00:00:00 2001
From: Carl Pearson
Date: Thu, 19 Sep 2019 10:59:28 -0500
Subject: [PATCH] initial commit

---
 CMakeLists.txt                               |  45 +++++++
 README.md                                    |  17 +++
 examples/CMakeLists.txt                      |  50 ++++++++
 examples/cpu_cache.cpp                       |  11 ++
 examples/cpu_turbo.cpp                       |  14 +++
 examples/gpu_clocks.cu                       |   5 +
 examples/gpu_turbo.cu                        |   5 +
 examples/os_perf.cpp                         |  26 ++++
 include/perfect/CMakeLists.txt               |   4 +
 include/perfect/cpu_cache.hpp                | 126 +++++++++++++++++++
 include/perfect/cpu_turbo.hpp                |  41 ++++++
 include/perfect/detail/os/linux.hpp          |  51 ++++++++
 include/perfect/detail/turbo/linux_amd64.hpp |  50 ++++++++
 include/perfect/detail/turbo/linux_power.hpp |  47 +++++++
 include/perfect/gpu_clocks.hpp               |   1 +
 include/perfect/gpu_turbo.hpp                |   1 +
 include/perfect/os_perf.hpp                  |  55 ++++++++
 include/perfect/result.hpp                   |  23 ++++
 18 files changed, 572 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 examples/CMakeLists.txt
 create mode 100644 examples/cpu_cache.cpp
 create mode 100644 examples/cpu_turbo.cpp
 create mode 100644 examples/gpu_clocks.cu
 create mode 100644 examples/gpu_turbo.cu
 create mode 100644 examples/os_perf.cpp
 create mode 100644 include/perfect/CMakeLists.txt
 create mode 100644 include/perfect/cpu_cache.hpp
 create mode 100644 include/perfect/cpu_turbo.hpp
 create mode 100644 include/perfect/detail/os/linux.hpp
 create mode 100644 include/perfect/detail/turbo/linux_amd64.hpp
 create mode 100644 include/perfect/detail/turbo/linux_power.hpp
 create mode 100644 include/perfect/gpu_clocks.hpp
 create mode 100644 include/perfect/gpu_turbo.hpp
 create mode 100644 include/perfect/os_perf.hpp
 create mode 100644 include/perfect/result.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..107637b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,45 @@
+# 3.8+ for project(LANGUAGES CUDA)
+# 3.9+ for OpenMP::OpenMP_CXX
+# 3.10+ findopenmp gained support for language-specific components
+# 3.11+ for CMake not to add -fopenmp to the nvcc flags
+cmake_minimum_required(VERSION 3.11 FATAL_ERROR)
+
+project(perfect LANGUAGES CUDA CXX VERSION 0.1.0)
+
+#https://blog.kitware.com/cmake-and-the-default-build-type/
+# Set a default build type if none was specified
+set(default_build_type "Release")
+if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
+  set(default_build_type "Debug")
+endif()
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
+  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
+
+# CMAKE_CUDA_FLAGS is one command-line string: append inside quotes so that no ';' list separator is inserted
+if (CMAKE_BUILD_TYPE MATCHES Debug)
+  message(STATUS "Setting verbose build during Debug")
+  set(CMAKE_VERBOSE_MAKEFILE ON)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+elseif (CMAKE_BUILD_TYPE MATCHES Release)
+  add_definitions(-DNDEBUG)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
+endif()
+
+
+
+set(CMAKE_CUDA_STANDARD 11)
+
+add_subdirectory(include/perfect)
+
+add_library(perfect INTERFACE)
+target_include_directories(perfect INTERFACE include/)
+
+
+add_subdirectory(examples)
\ No newline at end of file
diff --git a/README.md b/README.md
index 4627d4b..051f293 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,19 @@
 # perfect
+
 CPU/GPU performance control library for benchmarking
+* x86
+* POWER
+* Nvidia
+
+## Features
+
+- [x] Disable CPU turbo (linux)
+- [x] Set OS CPU performance mode to maximum (linux)
+- [x] Set GPU clocks (nvidia)
+- [x] Disable GPU turbo (nvidia)
+- [x] Flush addresses from cache (amd64, POWER)
+
+## Wish List
+
+- [ ] Nvidia GPU power monitoring
+- [ ] Nvidia GPU utilization monitoring
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index
0000000..fdd5364
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,50 @@
+# removed -Wredundant-decls for cuda 10.1
+# removed -Wundef for cuda 10.0
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
+ -Xcompiler=-Wall\
+ -Xcompiler=-Wextra\
+ -Xcompiler=-Wcast-qual \
+ -Xcompiler=-Wcast-align \
+ -Xcompiler=-Wstrict-aliasing \
+ -Xcompiler=-Wpointer-arith \
+ -Xcompiler=-Winit-self \
+ -Xcompiler=-Wshadow \
+ -Xcompiler=-Wswitch-enum \
+ -Xcompiler=-Wfloat-equal \
+ -Xcompiler=-Wvla\
+ -Xcompiler=-fmax-errors=1 \
+ -Xcompiler=-Wfatal-errors\
+ ")
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+ -Wredundant-decls \
+ -Wundef \
+ -Wall\
+ -Wextra\
+ -Wcast-qual \
+ -Wcast-align \
+ -Wstrict-aliasing \
+ -Wpointer-arith \
+ -Winit-self \
+ -Wshadow \
+ -Wswitch-enum \
+ -Wfloat-equal \
+ -Wvla\
+ -fmax-errors=1 \
+ -Wfatal-errors\
+ ")
+
+add_executable(cpu-cache cpu_cache.cpp)
+target_link_libraries(cpu-cache perfect)
+
+add_executable(cpu-turbo cpu_turbo.cpp)
+target_link_libraries(cpu-turbo perfect)
+
+add_executable(os-perf os_perf.cpp)
+target_link_libraries(os-perf perfect)
+
+add_executable(gpu-clocks gpu_clocks.cu)
+target_link_libraries(gpu-clocks perfect)
+
+add_executable(gpu-turbo gpu_turbo.cu)
+target_link_libraries(gpu-turbo perfect)
\ No newline at end of file
diff --git a/examples/cpu_cache.cpp b/examples/cpu_cache.cpp
new file mode 100644
index 0000000..550240e
--- /dev/null
+++ b/examples/cpu_cache.cpp
@@ -0,0 +1,11 @@
+#include "perfect/cpu_cache.hpp"
+
+int main(void) {
+  int *a = new int[1024];
+  flush_all(a, 1024 * sizeof(int));
+
+  // do things with `a` flushed from cache into main memory
+  // furthermore, all loads and stores before this function call are guaranteed to be complete
+
+  delete[] a;
+}
\ No newline at end of file
diff --git a/examples/cpu_turbo.cpp b/examples/cpu_turbo.cpp
new file mode 100644
index 0000000..e4d658b
--- /dev/null
+++ b/examples/cpu_turbo.cpp
@@ -0,0 +1,14 @@
+#include "perfect/cpu_turbo.hpp"
+
+int main(void) {
+
+  perfect::CpuTurboState state;
+  perfect::get_cpu_turbo_state(&state);
+
+  perfect::disable_cpu_turbo();
+
+  // do things with CPU turbo disabled
+
+  perfect::set_cpu_turbo_state(state);
+
+}
\ No newline at end of file
diff --git a/examples/gpu_clocks.cu b/examples/gpu_clocks.cu
new file mode 100644
index 0000000..ae465f3
--- /dev/null
+++ b/examples/gpu_clocks.cu
@@ -0,0 +1,5 @@
+#include "perfect/gpu_clocks.hpp"
+
+int main(void) {
+
+}
\ No newline at end of file
diff --git a/examples/gpu_turbo.cu b/examples/gpu_turbo.cu
new file mode 100644
index 0000000..4170786
--- /dev/null
+++ b/examples/gpu_turbo.cu
@@ -0,0 +1,5 @@
+#include "perfect/gpu_turbo.hpp"
+
+int main(void) {
+
+}
\ No newline at end of file
diff --git a/examples/os_perf.cpp b/examples/os_perf.cpp
new file mode 100644
index 0000000..b8b59a6
--- /dev/null
+++ b/examples/os_perf.cpp
@@ -0,0 +1,26 @@
+#include "perfect/os_perf.hpp"
+
+#include <map>
+
+int main(void) {
+
+  // setting of every usable CPU, saved so that it can be restored afterwards
+  std::map<int, perfect::OsPerfState> states;
+
+  for (auto cpu : perfect::cpus()) {
+    perfect::OsPerfState state;
+    perfect::get_os_perf_state(&state, cpu);
+    states[cpu] = state;
+    perfect::os_perf_state_maximum(cpu);
+  }
+
+  // do things with all CPUs set to the maximum performance mode by the OS
+
+  for (const auto &kv : states) {
+    int cpu = kv.first;
+    perfect::OsPerfState state = kv.second;
+    perfect::set_os_perf_state(cpu, state);
+  }
+
+
+}
\ No newline at end of file
diff --git a/include/perfect/CMakeLists.txt b/include/perfect/CMakeLists.txt
new file mode 100644
index 0000000..2e796c3
--- /dev/null
+++ b/include/perfect/CMakeLists.txt
@@ -0,0 +1,4 @@
+# PARENT_SCOPE leaves the local variable untouched, so the list is appended in a single call
+set(PERFECT_HEADERS ${PERFECT_HEADERS}
+    cpu_cache.hpp cpu_turbo.hpp os_perf.hpp result.hpp
+    PARENT_SCOPE)
\ No newline at end of file
diff --git a/include/perfect/cpu_cache.hpp b/include/perfect/cpu_cache.hpp
new file mode 100644
index 0000000..68a8d77
--- /dev/null
+++ b/include/perfect/cpu_cache.hpp
@@ -0,0 +1,126 @@
+#pragma once
+
+/*!
+Routines for controlling CPU caching
+*/
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#ifdef __linux__
+#include <unistd.h>
+#endif
+
+// https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#Simple-Constraints
+
+/*! Flush the cache line containing address p from all cache levels. */
+inline void flush_line(void *p) {
+#if defined(__powerpc__)
+
+  /*
+  PowerISA_V2.07B p. 773
+  dcbf RA,RB,L
+
+  effective address is RA|0 + RB
+  this mnemonic has L=0, which is through all cache levels
+  write block to storage and mark as invalid in all processors
+  */
+
+  /*!
+
+  linux/arch/powerpc/include/asm/cache.h
+  */
+  asm volatile("dcbf 0, %0"
+               : // no outputs
+               : "r"(p)
+               : "memory");
+
+#elif defined(__amd64__)
+
+  /*!
+
+  arch/x86/include/asm/special_insns.h
+
+  p139
+  https://www.amd.com/system/files/TechDocs/24594.pdf
+
+  clflush mem8
+  */
+
+  // the operand must be the byte p points at; "+m"(p) would name the
+  // pointer variable itself and flush the line holding that local instead
+  asm volatile("clflush %0"
+               : "+m"(*static_cast<volatile char *>(p))
+               : // no inputs
+               : // no clobbers
+  );
+#else
+#error "unsupported platform"
+  (void)p;
+#endif
+}
+
+/*! Full memory barrier: order all earlier loads and stores before later ones. */
+inline void barrier_all() {
+
+#if defined(__powerpc__)
+
+  // sync is a mnemonic for sync 0, heavyweight sync
+  asm volatile("sync"
+               : // no outputs
+               : // no inputs
+               : "memory");
+
+#elif defined(__amd64__)
+
+  asm volatile("mfence"
+               : // no outputs
+               : // no inputs
+               : "memory");
+
+#else
+#error "unsupported platform"
+#endif
+}
+
+/*! return the smallest cache line size detected on the platform.
+Return 16 if the cache line size could not be detected.
+*/
+inline size_t cache_linesize() {
+#ifdef __linux__
+  const int names[] = {_SC_LEVEL1_DCACHE_LINESIZE, _SC_LEVEL2_CACHE_LINESIZE,
+                       _SC_LEVEL3_CACHE_LINESIZE, _SC_LEVEL4_CACHE_LINESIZE};
+  long linesize = 0;
+  for (const int name : names) {
+    // sysconf yields 0 or -1 for a level it cannot describe: skip those
+    const long var = sysconf(name);
+    if (var > 0) {
+      linesize = linesize > 0 ? std::min(linesize, var) : var;
+    }
+  }
+  return linesize > 0 ? static_cast<size_t>(linesize) : 16;
+#else
+#error "unsupported platform"
+#endif
+}
+
+/*! Flush every cache line overlapping the n bytes at p, fenced on both sides. */
+inline void flush_all(void *p, const size_t n) {
+  const size_t lineSize = cache_linesize();
+  const uintptr_t first = reinterpret_cast<uintptr_t>(p);
+  const uintptr_t last = first + n;
+
+  // cache flush may not be ordered wrt other kinds of accesses
+  barrier_all();
+
+  // begin at the start of the line holding p so that a range which is not
+  // line-aligned still has its final, partially covered line flushed
+  for (uintptr_t a = first - first % lineSize; n != 0 && a < last; a += lineSize) {
+    flush_line(reinterpret_cast<void *>(a));
+  }
+
+  // make flushing visible to other accesses
+  barrier_all();
+}
+
diff --git a/include/perfect/cpu_turbo.hpp b/include/perfect/cpu_turbo.hpp
new file mode 100644
index 0000000..96cc535
--- /dev/null
+++ b/include/perfect/cpu_turbo.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#ifdef __linux__ // linux
+#include "detail/os/linux.hpp"
+
+#if defined(__amd64__)
+#include "detail/turbo/linux_amd64.hpp"
+#elif defined(__powerpc64__)
+#include "detail/turbo/linux_power.hpp"
+#else
+#error "unsupported CPU arch"
+#endif
+
+#else // not linux
+#error "unsupported OS"
+#endif
+
+#include "result.hpp"
+
+namespace perfect {
+
+/*! Snapshot of the CPU turbo setting, so that it can be restored later. */
+struct CpuTurboState {
+  bool enabled;
+};
+
+/*! Record in *state whether CPU turbo is enabled; Result::UNKNOWN if state is null. */
+inline Result get_cpu_turbo_state(CpuTurboState *state) {
+  if (!state) {
+    return Result::UNKNOWN;
+  }
+  state->enabled = is_turbo_enabled();
+  return Result::SUCCESS;
+}
+
+/*! Enable or disable CPU turbo to match a previously recorded state. */
+inline Result set_cpu_turbo_state(CpuTurboState state) {
+  return state.enabled ? enable_cpu_turbo() : disable_cpu_turbo();
+}
+
+} // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/detail/os/linux.hpp b/include/perfect/detail/os/linux.hpp
new file mode 100644
index 0000000..12f547a
--- /dev/null
+++ b/include/perfect/detail/os/linux.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <cassert>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include <sched.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "perfect/result.hpp"
+
+namespace perfect {
+
+/*! Return the ids of the CPUs the calling thread is allowed to run on. */
+inline std::vector<int> cpus() {
+  std::vector<int> result;
+  cpu_set_t mask;
+  CPU_ZERO(&mask); // stay empty if the query fails and NDEBUG removes the assert
+  if (sched_getaffinity(0 /*caller*/, sizeof(cpu_set_t), &mask)) {
+    assert(0 && "failed sched_getaffinity");
+  }
+  for (int i = 0; i < CPU_SETSIZE; ++i) {
+    if (CPU_ISSET(i, &mask)) {
+      result.push_back(i);
+    }
+  }
+  return result;
+}
+
+/*! Path of the cpufreq scaling_governor file for `cpu`. */
+inline std::string governor_path(const int cpu) {
+  return "/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/cpufreq/scaling_governor";
+}
+
+/*! Read the scaling governor of `cpu` into `result`; Result::UNKNOWN if it cannot be read. */
+inline Result get_governor(std::string &result, const int cpu) {
+  std::ifstream ifs(governor_path(cpu), std::ifstream::in);
+  return std::getline(ifs, result) ? Result::SUCCESS : Result::UNKNOWN;
+}
+
+/*! Write `governor` as the scaling governor of `cpu`; Result::NO_PERMISSION if the write fails. */
+inline Result set_governor(const int cpu, const std::string &governor) {
+  std::ofstream ofs(governor_path(cpu), std::ofstream::out);
+  ofs << governor;
+  ofs.close();
+  return ofs.fail() ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+} // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/detail/turbo/linux_amd64.hpp b/include/perfect/detail/turbo/linux_amd64.hpp
new file mode 100644
index 0000000..de6b615
--- /dev/null
+++ b/include/perfect/detail/turbo/linux_amd64.hpp
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <cassert>
+#include <fstream>
+#include <string>
+
+#include "perfect/result.hpp"
+
+namespace perfect {
+// intel_pstate exposes turbo as an inverted flag: writing "1" to no_turbo
+// disables turbo, writing "0" allows it again
+// https://www.kernel.org/doc/html/latest/admin-guide/pm/intel_pstate.html
+
+/*! True if the intel_pstate no_turbo control file can be opened for reading. */
+inline bool has_intel_pstate_no_turbo() {
+  return bool(std::ifstream("/sys/devices/system/cpu/intel_pstate/no_turbo"));
+}
+
+/*! Write s to the no_turbo control file. Return 0 on success, 1 on failure. */
+inline int write_intel_pstate_no_turbo(const std::string &s) {
+  assert(has_intel_pstate_no_turbo());
+  std::ofstream ofs("/sys/devices/system/cpu/intel_pstate/no_turbo", std::ofstream::out);
+  ofs << s;
+  ofs.close();
+  return ofs.fail() ? 1 : 0;
+}
+
+/*! Return the first line of the no_turbo control file. */
+inline std::string read_intel_pstate_no_turbo() {
+  assert(has_intel_pstate_no_turbo());
+  std::ifstream ifs("/sys/devices/system/cpu/intel_pstate/no_turbo", std::ifstream::in);
+  std::string result;
+  std::getline(ifs, result);
+  return result;
+}
+
+/*! Turbo is enabled when no_turbo reads "0". */
+inline bool is_turbo_enabled() { return "0" == read_intel_pstate_no_turbo(); }
+
+/*! Disable turbo by writing "1" to no_turbo; Result::NO_PERMISSION if the write fails. */
+inline Result disable_cpu_turbo() {
+  return write_intel_pstate_no_turbo("1") ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+/*! Enable turbo by writing "0" to no_turbo; Result::NO_PERMISSION if the write fails. */
+inline Result enable_cpu_turbo() {
+  return write_intel_pstate_no_turbo("0") ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+} // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/detail/turbo/linux_power.hpp b/include/perfect/detail/turbo/linux_power.hpp
new file mode 100644
index 0000000..551ec45
--- /dev/null
+++ b/include/perfect/detail/turbo/linux_power.hpp
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <cassert>
+#include <fstream>
+#include <string>
+
+#include "perfect/result.hpp"
+
+namespace perfect {
+
+/*! True if the cpufreq boost control file can be opened for reading. */
+inline bool has_acpi_cpufreq_boost() {
+  return bool(std::ifstream("/sys/devices/system/cpu/cpufreq/boost"));
+}
+
+/*! Write s to the boost control file. Return 0 on success, 1 on failure. */
+inline int write_acpi_cpufreq_boost(const std::string &s) {
+  assert(has_acpi_cpufreq_boost());
+  std::ofstream ofs("/sys/devices/system/cpu/cpufreq/boost", std::ofstream::out);
+  ofs << s;
+  ofs.close();
+  return ofs.fail() ? 1 : 0;
+}
+
+/*! Return the first line of the boost control file. */
+inline std::string read_acpi_cpufeq_boost() {
+  assert(has_acpi_cpufreq_boost());
+  std::ifstream ifs("/sys/devices/system/cpu/cpufreq/boost", std::ifstream::in);
+  std::string result;
+  std::getline(ifs, result);
+  return result;
+}
+
+/*! Turbo is enabled when boost reads "1". */
+inline bool is_turbo_enabled() { return "1" == read_acpi_cpufeq_boost(); }
+
+/*! Disable turbo by writing "0" to boost; Result::NO_PERMISSION if the write fails. */
+inline Result disable_cpu_turbo() {
+  return write_acpi_cpufreq_boost("0") ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+/*! Enable turbo by writing "1" to boost; Result::NO_PERMISSION if the write fails. */
+inline Result enable_cpu_turbo() {
+  return write_acpi_cpufreq_boost("1") ? Result::NO_PERMISSION : Result::SUCCESS;
+}
+
+} // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/gpu_clocks.hpp b/include/perfect/gpu_clocks.hpp
new file mode 100644
index 0000000..7b9637e
--- /dev/null
+++ b/include/perfect/gpu_clocks.hpp
@@ -0,0 +1 @@
+#pragma once
\ No newline at end of file
diff --git a/include/perfect/gpu_turbo.hpp b/include/perfect/gpu_turbo.hpp
new file mode 100644
index 0000000..7b9637e
--- /dev/null
+++ b/include/perfect/gpu_turbo.hpp
@@ -0,0 +1 @@
+#pragma once
\ No newline at end of file
diff --git a/include/perfect/os_perf.hpp
b/include/perfect/os_perf.hpp
new file mode 100644
index 0000000..d3ba39d
--- /dev/null
+++ b/include/perfect/os_perf.hpp
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <cassert>
+#include <string>
+#include <vector>
+
+#ifdef __linux__
+#include "detail/os/linux.hpp"
+#else
+#error "unsupported platform"
+#endif
+
+#include "result.hpp"
+
+namespace perfect {
+
+/*! Saved OS performance setting of one CPU, used to restore it later. */
+struct OsPerfState {
+#ifdef __linux__
+  std::string governor;
+#else
+#error "unsupported platform"
+#endif
+};
+
+/*! Store the current OS performance setting of `cpu` in *state.
+state must not be null. */
+inline Result get_os_perf_state(OsPerfState *state, const int cpu) {
+  assert(state);
+#ifdef __linux__
+  return get_governor(state->governor, cpu);
+#else
+#error "unsupported platform"
+#endif
+}
+
+/*! Ask the OS to run `cpu` in its maximum performance mode. */
+inline Result os_perf_state_maximum(const int cpu) {
+#ifdef __linux__
+  return set_governor(cpu, "performance");
+#else
+#error "unsupported platform"
+#endif
+}
+
+/*! Restore a setting previously captured by get_os_perf_state. */
+inline Result set_os_perf_state(const int cpu, OsPerfState state) {
+#ifdef __linux__
+  return set_governor(cpu, state.governor);
+#else
+#error "unsupported platform"
+#endif
+}
+
+} // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/result.hpp b/include/perfect/result.hpp
new file mode 100644
index 0000000..b2d93b0
--- /dev/null
+++ b/include/perfect/result.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <cassert>
+
+namespace perfect {
+
+/*! Status code returned by the routines in this library. */
+enum class Result { SUCCESS, NVIDIA_ML, NO_PERMISSION, UNKNOWN };
+
+/*! Return a static string describing `result`. */
+inline const char *get_string(const Result &result) {
+  switch (result) {
+  case Result::SUCCESS: return "success";
+  case Result::NO_PERMISSION: return "no permission";
+  case Result::UNKNOWN: return "unknown error";
+  case Result::NVIDIA_ML: return "nvidia-ml error";
+  }
+  // only reachable for a value outside the enum; keep NDEBUG builds defined
+  assert(0 && "unexpected perfect::Result");
+  return "unexpected perfect::Result";
+}
+
+} // namespace perfect