From 13e6c8e03daa044f64797ab09939603efd4d3954 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 19 Sep 2019 16:56:57 -0500 Subject: [PATCH] work on gpu clocks and gpu turbo example --- README.md | 4 + examples/cpu_cache.cpp | 3 + examples/gpu_clocks.cu | 13 +++ examples/gpu_turbo.cu | 13 +++ include/perfect/cpu_cache.hpp | 101 ++------------------ include/perfect/detail/cache/amd64.hpp | 35 +++++++ include/perfect/detail/cache/power.hpp | 35 +++++++ include/perfect/detail/nvidia/nvidia-ml.hpp | 26 ++--- include/perfect/detail/os/linux.hpp | 26 +++++ include/perfect/gpu_clocks.hpp | 42 ++++---- include/perfect/result.hpp | 14 +++ tools/enable_turbo.cpp | 18 +--- 12 files changed, 194 insertions(+), 136 deletions(-) create mode 100644 include/perfect/detail/cache/amd64.hpp create mode 100644 include/perfect/detail/cache/power.hpp diff --git a/README.md b/README.md index 051f293..830d7a7 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,10 @@ CPU/GPU performance control library for benchmarking - [x] Disable GPU turbo (nvidia) - [x] Flush addresses from cache (amd64, POWER) +## API + + + ## Wish List - [ ] Nvidia GPU power monitoring diff --git a/examples/cpu_cache.cpp b/examples/cpu_cache.cpp index 550240e..79806c5 100644 --- a/examples/cpu_cache.cpp +++ b/examples/cpu_cache.cpp @@ -1,6 +1,9 @@ #include "perfect/cpu_cache.hpp" int main(void) { + + using namespace perfect; + int *a = new int[1024]; flush_all(a, 1024 * sizeof(int)); diff --git a/examples/gpu_clocks.cu b/examples/gpu_clocks.cu index ae465f3..3d2d2ef 100644 --- a/examples/gpu_clocks.cu +++ b/examples/gpu_clocks.cu @@ -1,5 +1,18 @@ +#include + #include "perfect/gpu_clocks.hpp" +#include "perfect/init.hpp" int main(void) { + using namespace perfect; + + init(); + + for (unsigned int gpu = 0; gpu < 1; ++gpu) { + PERFECT(perfect::set_max_gpu_clocks(gpu)); + PERFECT(perfect::reset_gpu_clocks(gpu)); + } + + return 0; } \ No newline at end of file diff --git a/examples/gpu_turbo.cu b/examples/gpu_turbo.cu index 4170786..dfd7fc0 100644 --- a/examples/gpu_turbo.cu +++ b/examples/gpu_turbo.cu @@ -1,5 +1,18 @@ #include "perfect/gpu_turbo.hpp" +#include "perfect/init.hpp" + +#define OR_DIE(expr) int main(void) { + using namespace perfect; + GpuTurboState state; + + init(); + + for (unsigned int gpu = 0; gpu < 1; ++gpu) { + PERFECT(perfect::get_gpu_turbo_state(&state, gpu)); + PERFECT(perfect::disable_gpu_turbo(gpu)); + PERFECT(perfect::set_gpu_turbo_state(state, gpu)); + } } \ No newline at end of file diff --git a/include/perfect/cpu_cache.hpp b/include/perfect/cpu_cache.hpp index 68a8d77..b28e85c 100644 --- a/include/perfect/cpu_cache.hpp +++ b/include/perfect/cpu_cache.hpp @@ -11,116 +11,35 @@ Routines for controlling CPU caching #include #ifdef __linux__ -#include -#endif - -// https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#Simple-Constraints - -inline void flush_line(void *p) { -#ifdef __powerpc__ - - /* - PowerISA_V2.07B p. 773 - dcbf RA,RB,L - - effective address is RA|0 + RB - this mnemonic has L=0, which is through all cache levels - write block to storage and mark as invalid in all processors - */ - - /*! - - linux/arch/powerpc/include/asm/cache.h - */ - asm volatile("dcbf 0, %0" - : // no outputs - : "r"(p) - : "memory"); - -#elif __amd64__ - - /*! - - arch/x86/include/asm/special_insns.h - - p139 - https://www.amd.com/system/files/TechDocs/24594.pdf - - clflush mem8 - */ - - asm volatile("clflush %0" - : "+m"(p) - : // no inputs - : // no clobbers - ); +#include "detail/os/linux.hpp" #else -#error "unsupported platform" - (void)p; +#error "unsupported OS" #endif -} - -inline void barrier_all() { #ifdef __powerpc__ - - // sync is a mnemonic for sync 0, heavyweight sync - asm volatile("sync" - : // no outputs - : // no inputs - : "memory"); - +#include "detail/cache/power.hpp" #elif __amd64__ - - asm volatile("mfence" - : // no outputs - : // no inputs - : "memory"); - +#include "detail/cache/amd64.hpp" #else -#error "unsupported platform" +#error "unsupported CPU arch" #endif -} -/*! return the smallest cache line size detected on the platform. -Return 16 if the cache line size could not be detected. -*/ -size_t cache_linesize() { -#ifdef __linux__ - long linesize, var; - - var = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - linesize = var; - - var = sysconf(_SC_LEVEL2_CACHE_LINESIZE); - linesize = var ? std::min(linesize, var) : linesize; - - var = sysconf(_SC_LEVEL3_CACHE_LINESIZE); - linesize = var ? std::min(linesize, var) : linesize; - - var = sysconf(_SC_LEVEL4_CACHE_LINESIZE); - linesize = var ? std::min(linesize, var) : linesize; - - linesize = linesize ? linesize : 16; - return linesize; -#else -#error "unsupported platform" -#endif -} +namespace perfect { inline void flush_all(void *p, const size_t n) { size_t lineSize = cache_linesize(); // cache flush may not be ordered wrt other kinds of accesses - barrier_all(); + detail::barrier_all(); for (size_t i = 0; i < n; i += lineSize) { char *c = static_cast(p); - flush_line(&c[i]); + detail::flush_line(&c[i]); } // make flushing visible to other accesses - barrier_all(); + detail::barrier_all(); } +} \ No newline at end of file diff --git a/include/perfect/detail/cache/amd64.hpp b/include/perfect/detail/cache/amd64.hpp new file mode 100644 index 0000000..e0984aa --- /dev/null +++ b/include/perfect/detail/cache/amd64.hpp @@ -0,0 +1,35 @@ +#pragma once + +namespace perfect { + namespace detail { + + + +inline void flush_line(void *p) { + /*! + + arch/x86/include/asm/special_insns.h + + p139 + https://www.amd.com/system/files/TechDocs/24594.pdf + + clflush mem8 + */ + + asm volatile("clflush %0" + : "+m"(p) + : // no inputs + : // no clobbers + ); +} + +inline void barrier_all() { + + asm volatile("mfence" + : // no outputs + : // no inputs + : "memory"); +} + + } +} \ No newline at end of file diff --git a/include/perfect/detail/cache/power.hpp b/include/perfect/detail/cache/power.hpp new file mode 100644 index 0000000..3cec348 --- /dev/null +++ b/include/perfect/detail/cache/power.hpp @@ -0,0 +1,35 @@ +#pragma once + +inline void flush_line(void *p) { + + /* + PowerISA_V2.07B p. 773 + dcbf RA,RB,L + + effective address is RA|0 + RB + this mnemonic has L=0, which is through all cache levels + write block to storage and mark as invalid in all processors + */ + + /*! + + linux/arch/powerpc/include/asm/cache.h + */ + asm volatile("dcbf 0, %0" + : // no outputs + : "r"(p) + : "memory"); + + +} + +inline void barrier_all() { + + // sync is a mnemonic for sync 0, heavyweight sync + asm volatile("sync" + : // no outputs + : // no inputs + : "memory"); + + +} \ No newline at end of file diff --git a/include/perfect/detail/nvidia/nvidia-ml.hpp b/include/perfect/detail/nvidia/nvidia-ml.hpp index 1ac1dfb..40d2ca4 100644 --- a/include/perfect/detail/nvidia/nvidia-ml.hpp +++ b/include/perfect/detail/nvidia/nvidia-ml.hpp @@ -18,19 +18,23 @@ inline void checkNvml(nvmlReturn_t result, const char *file, const int line) { namespace perfect { namespace detail { -std::vector get_device_memory_clocks(unsigned int index) { - std::vector result; - nvmlDevice_t device; - nvmlDeviceGetHandleByIndex(index, &device); - unsigned int resultCount = 0; - auto ret = nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, nullptr); - if (ret != NVML_ERROR_INSUFFICIENT_SIZE) { - NVML(ret); +Result get_device_memory_clocks(std::vector &memoryClocksMhz, unsigned int index) { + nvmlDevice_t device; + nvmlReturn_t ret; + ret = nvmlDeviceGetHandleByIndex(index, &device); + if (ret != NVML_SUCCESS) { + return from_nvml(ret); } - result.resize(resultCount); - NVML(nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, result.data())); - return result; + + unsigned int resultCount = 0; + ret = nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, nullptr); + if (ret != NVML_ERROR_INSUFFICIENT_SIZE) { + return from_nvml(ret); + } + memoryClocksMhz.resize(resultCount); + NVML(nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, memoryClocksMhz.data())); + return Result::SUCCESS; } Result get_device_graphics_clocks(std::vector &graphicsClocksMhz, diff --git a/include/perfect/detail/os/linux.hpp b/include/perfect/detail/os/linux.hpp index 6568c70..8cbe4c7 100644 --- a/include/perfect/detail/os/linux.hpp +++ b/include/perfect/detail/os/linux.hpp @@ -51,4 +51,30 @@ Result set_governor(const int cpu, const std::string &governor) { return Result::SUCCESS; } +/*! return the smallest cache line size detected on the platform. +Return 16 if the cache line size could not be detected. +*/ +size_t cache_linesize() { +#ifdef __linux__ + long linesize, var; + + var = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + linesize = var; + + var = sysconf(_SC_LEVEL2_CACHE_LINESIZE); + linesize = var ? std::min(linesize, var) : linesize; + + var = sysconf(_SC_LEVEL3_CACHE_LINESIZE); + linesize = var ? std::min(linesize, var) : linesize; + + var = sysconf(_SC_LEVEL4_CACHE_LINESIZE); + linesize = var ? std::min(linesize, var) : linesize; + + linesize = linesize ? linesize : 16; + return linesize; +#else +#error "unsupported platform" +#endif +} + } // namespace perfect \ No newline at end of file diff --git a/include/perfect/gpu_clocks.hpp b/include/perfect/gpu_clocks.hpp index e719b8d..6cf4226 100644 --- a/include/perfect/gpu_clocks.hpp +++ b/include/perfect/gpu_clocks.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include "detail/nvidia/nvidia-ml.hpp" namespace perfect { @@ -8,22 +10,28 @@ namespace perfect { */ Result set_max_gpu_clocks(unsigned int idx) { - Result rt; + Result ret; std::vector clksMhz; + nvmlDevice_t device; - ret = get_device_memory_clocks(clksMhz, idx); - - auto maxMemMhz = *std::max_element(memClksMhz.begin(), memClksMhz.end()); - ret = get_device_graphics_clocks(clksMhz, idx); - auto maxCoreMhz = *std::max_element(clksMhz.begin(), clksMhz.end()); - - auto ret = nvmlDeviceSetApplicationsClocks(device, maxMemMhz, maxCoreMhz); - if (ret == NVML_ERROR_NOT_SUPPORTED) { - return Result::NVML_NOT_SUPPORTED; - } else if (ret == NVML_ERROR_NO_PERMISSION) { - return Result::NVML_NO_PERMISSION; + ret = from_nvml(nvmlDeviceGetHandleByIndex(idx, &device)); + if (ret != Result::SUCCESS) { + return ret; } - return Result::SUCCESS; + + ret = detail::get_device_memory_clocks(clksMhz, idx); + if (ret != Result::SUCCESS) { + return ret; + } + + auto maxMemMhz = *std::max_element(clksMhz.begin(), clksMhz.end()); + ret = detail::get_device_graphics_clocks(clksMhz, idx, maxMemMhz); + if (ret != Result::SUCCESS) { + return ret; + } + auto maxCoreMhz = *std::max_element(clksMhz.begin(), clksMhz.end()); + return from_nvml( + nvmlDeviceSetApplicationsClocks(device, maxMemMhz, maxCoreMhz)); } /*! Reset GPU clocks to default behavior @@ -36,13 +44,7 @@ Result reset_gpu_clocks(unsigned int idx) { if (ret != NVML_SUCCESS) { assert(false); } - ret = nvmlDeviceResetApplicationsClocks(device); - if (ret == NVML_ERROR_NOT_SUPPORTED) { - return Result::NVML_NOT_SUPPORTED; - } else if (ret == NVML_ERROR_NO_PERMISSION) { - return Result::NVML_NO_PERMISSION; - } - return Result::SUCCESS; + return from_nvml(nvmlDeviceResetApplicationsClocks(device)); } }; // namespace perfect \ No newline at end of file diff --git a/include/perfect/result.hpp b/include/perfect/result.hpp index 5cf3ba4..b0c6fce 100644 --- a/include/perfect/result.hpp +++ b/include/perfect/result.hpp @@ -14,6 +14,10 @@ enum class Result { UNKNOWN }; + + + + Result from_nvml(nvmlReturn_t nvml) { switch (nvml) { case NVML_SUCCESS: @@ -51,4 +55,14 @@ const char *get_string(const Result &result) { return ""; } +inline void check(Result result, const char *file, const int line) { + if (result != Result::SUCCESS) { + fprintf(stderr, "%s@%d: perfect Error: %s\n", file, line, + get_string(result)); + exit(-1); + } +} + } // namespace perfect + +#define PERFECT(stmt) check(stmt, __FILE__, __LINE__); diff --git a/tools/enable_turbo.cpp b/tools/enable_turbo.cpp index 208a8e4..ca97206 100644 --- a/tools/enable_turbo.cpp +++ b/tools/enable_turbo.cpp @@ -12,24 +12,14 @@ int main(void) { perfect::init(); - ret = get_cpu_turbo_state(&state); - - if (ret != Result::SUCCESS) { - std::cerr << "ERROR: " << get_string(ret) << "\n"; - exit(EXIT_FAILURE); - } + PERFECT(get_cpu_turbo_state(&state)); if (is_turbo_enabled(state)) { - std::cerr << "turbo already enabled\n"; + std::cerr << "cpu turbo already enabled\n"; exit(EXIT_SUCCESS); } else { - ret = enable_cpu_turbo(); - if (ret != Result::SUCCESS) { - std::cerr << "ERROR: " << get_string(ret) << "\n"; - exit(EXIT_FAILURE); - } else { - std::cerr << "enabled turbo\n"; + PERFECT(enable_cpu_turbo()); + std::cerr << "enabled cpu turbo\n"; exit(EXIT_SUCCESS); - } } } \ No newline at end of file