work on gpu clocks and gpu turbo example

This commit is contained in:
Carl Pearson
2019-09-19 16:56:57 -05:00
parent 81cc7feafd
commit 13e6c8e03d
12 changed files with 194 additions and 136 deletions

View File

@@ -13,6 +13,10 @@ CPU/GPU performance control library for benchmarking
- [x] Disable GPU turbo (nvidia) - [x] Disable GPU turbo (nvidia)
- [x] Flush addresses from cache (amd64, POWER) - [x] Flush addresses from cache (amd64, POWER)
## API
## Wish List ## Wish List
- [ ] Nvidia GPU power monitoring - [ ] Nvidia GPU power monitoring

View File

@@ -1,6 +1,9 @@
#include "perfect/cpu_cache.hpp" #include "perfect/cpu_cache.hpp"
int main(void) { int main(void) {
using namespace perfect;
int *a = new int[1024]; int *a = new int[1024];
flush_all(a, 1024 * sizeof(int)); flush_all(a, 1024 * sizeof(int));

View File

@@ -1,5 +1,18 @@
#include <iostream>
#include "perfect/gpu_clocks.hpp" #include "perfect/gpu_clocks.hpp"
#include "perfect/init.hpp"
int main(void) { int main(void) {
using namespace perfect;
init();
for (unsigned int gpu = 0; gpu < 1; ++gpu) {
PERFECT(perfect::set_max_gpu_clocks(gpu));
PERFECT(perfect::reset_gpu_clocks(gpu));
}
return 0;
} }

View File

@@ -1,5 +1,18 @@
#include "perfect/gpu_turbo.hpp" #include "perfect/gpu_turbo.hpp"
#include "perfect/init.hpp"
#define OR_DIE(expr)
int main(void) { int main(void) {
using namespace perfect;
GpuTurboState state;
init();
for (unsigned int gpu = 0; gpu < 1; ++gpu) {
PERFECT(perfect::get_gpu_turbo_state(&state, gpu));
PERFECT(perfect::disable_gpu_turbo(gpu));
PERFECT(perfect::set_gpu_turbo_state(state, gpu));
}
} }

View File

@@ -11,116 +11,35 @@ Routines for controlling CPU caching
#include <iostream> #include <iostream>
#ifdef __linux__ #ifdef __linux__
#include <unistd.h> #include "detail/os/linux.hpp"
#endif
// https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#Simple-Constraints
inline void flush_line(void *p) {
#ifdef __powerpc__
/*
PowerISA_V2.07B p. 773
dcbf RA,RB,L
effective address is RA|0 + RB
this mnemonic has L=0, which is through all cache levels
write block to storage and mark as invalid in all processors
*/
/*!
linux/arch/powerpc/include/asm/cache.h
*/
asm volatile("dcbf 0, %0"
: // no outputs
: "r"(p)
: "memory");
#elif __amd64__
/*!
arch/x86/include/asm/special_insns.h
p139
https://www.amd.com/system/files/TechDocs/24594.pdf
clflush mem8
*/
asm volatile("clflush %0"
: "+m"(p)
: // no inputs
: // no clobbers
);
#else #else
#error "unsupported platform" #error "unsupported OS"
(void)p;
#endif #endif
}
inline void barrier_all() {
#ifdef __powerpc__ #ifdef __powerpc__
#include "detail/cache/power.hpp"
// sync is a mnemonic for sync 0, heavyweight sync
asm volatile("sync"
: // no outputs
: // no inputs
: "memory");
#elif __amd64__ #elif __amd64__
#include "detail/cache/amd64.hpp"
asm volatile("mfence"
: // no outputs
: // no inputs
: "memory");
#else #else
#error "unsupported platform" #error "unsupported CPU arch"
#endif #endif
}
/*! return the smallest cache line size detected on the platform. namespace perfect {
Return 16 if the cache line size could not be detected.
*/
size_t cache_linesize() {
#ifdef __linux__
long linesize, var;
var = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
linesize = var;
var = sysconf(_SC_LEVEL2_CACHE_LINESIZE);
linesize = var ? std::min(linesize, var) : linesize;
var = sysconf(_SC_LEVEL3_CACHE_LINESIZE);
linesize = var ? std::min(linesize, var) : linesize;
var = sysconf(_SC_LEVEL4_CACHE_LINESIZE);
linesize = var ? std::min(linesize, var) : linesize;
linesize = linesize ? linesize : 16;
return linesize;
#else
#error "unsupported platform"
#endif
}
inline void flush_all(void *p, const size_t n) { inline void flush_all(void *p, const size_t n) {
size_t lineSize = cache_linesize(); size_t lineSize = cache_linesize();
// cache flush may not be ordered wrt other kinds of accesses // cache flush may not be ordered wrt other kinds of accesses
barrier_all(); detail::barrier_all();
for (size_t i = 0; i < n; i += lineSize) { for (size_t i = 0; i < n; i += lineSize) {
char *c = static_cast<char *>(p); char *c = static_cast<char *>(p);
flush_line(&c[i]); detail::flush_line(&c[i]);
} }
// make flushing visible to other accesses // make flushing visible to other accesses
barrier_all(); detail::barrier_all();
} }
}

35
include/perfect/detail/cache/amd64.hpp vendored Normal file
View File

@@ -0,0 +1,35 @@
#pragma once
namespace perfect {
namespace detail {

/*! Flush the cache line containing the byte at address p (amd64).

References:
  arch/x86/include/asm/special_insns.h
  p139 of https://www.amd.com/system/files/TechDocs/24594.pdf (clflush mem8)

\param p address inside the cache line to flush; must be a mapped address.
*/
inline void flush_line(void *p) {
  // The memory operand has to be the byte p points at. Passing the pointer
  // variable itself ("+m"(p)) would make clflush act on the line that holds
  // the local variable p instead of the caller's data.
  asm volatile("clflush %0"
               : "+m"(*static_cast<volatile char *>(p))
               : // no inputs
               : // no clobbers
  );
}

/*! Issue an mfence instruction.

The "memory" clobber additionally tells the compiler not to move memory
accesses across this point.
*/
inline void barrier_all() {
  asm volatile("mfence"
               : // no outputs
               : // no inputs
               : "memory");
}

} // namespace detail
} // namespace perfect

35
include/perfect/detail/cache/power.hpp vendored Normal file
View File

@@ -0,0 +1,35 @@
#pragma once
// These helpers live in perfect::detail so that callers written as
// detail::flush_line / detail::barrier_all inside namespace perfect resolve
// the same way on POWER as they do with the amd64 implementation.
namespace perfect {
namespace detail {

/*! Flush the cache block containing address p (POWER).

PowerISA_V2.07B p. 773
  dcbf RA,RB,L
  effective address is RA|0 + RB
  this mnemonic has L=0, which is through all cache levels
  write block to storage and mark as invalid in all processors

See also linux/arch/powerpc/include/asm/cache.h

\param p address inside the cache block to flush.
*/
inline void flush_line(void *p) {
  asm volatile("dcbf 0, %0"
               : // no outputs
               : "r"(p)
               : "memory");
}

/*! Issue a heavyweight sync.

sync is a mnemonic for sync 0, heavyweight sync. The "memory" clobber also
tells the compiler not to move memory accesses across this point.
*/
inline void barrier_all() {
  asm volatile("sync"
               : // no outputs
               : // no inputs
               : "memory");
}

} // namespace detail
} // namespace perfect

View File

@@ -18,19 +18,23 @@ inline void checkNvml(nvmlReturn_t result, const char *file, const int line) {
namespace perfect { namespace perfect {
namespace detail { namespace detail {
std::vector<unsigned int> get_device_memory_clocks(unsigned int index) {
std::vector<unsigned int> result;
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(index, &device);
unsigned int resultCount = 0;
auto ret = nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, nullptr); Result get_device_memory_clocks(std::vector<unsigned int> &memoryClocksMhz, unsigned int index) {
if (ret != NVML_ERROR_INSUFFICIENT_SIZE) { nvmlDevice_t device;
NVML(ret); nvmlReturn_t ret;
ret = nvmlDeviceGetHandleByIndex(index, &device);
if (ret != NVML_SUCCESS) {
return from_nvml(ret);
} }
result.resize(resultCount);
NVML(nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, result.data())); unsigned int resultCount = 0;
return result; ret = nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, nullptr);
if (ret != NVML_ERROR_INSUFFICIENT_SIZE) {
return from_nvml(ret);
}
memoryClocksMhz.resize(resultCount);
NVML(nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, memoryClocksMhz.data()));
return Result::SUCCESS;
} }
Result get_device_graphics_clocks(std::vector<unsigned int> &graphicsClocksMhz, Result get_device_graphics_clocks(std::vector<unsigned int> &graphicsClocksMhz,

View File

@@ -51,4 +51,30 @@ Result set_governor(const int cpu, const std::string &governor) {
return Result::SUCCESS; return Result::SUCCESS;
} }
/*! Return the smallest cache line size, in bytes, detected on the platform.

Queries sysconf() for the L1 data, L2, L3 and L4 cache line sizes and returns
the minimum of the values that are positive. A level for which sysconf()
reports 0 or a negative value (such as its -1 error return) is ignored, so it
cannot drag the result down to 0 or turn it into a huge size_t.

Return 16 if the cache line size could not be detected for any level.

Declared inline because the definition lives in a header.
*/
inline size_t cache_linesize() {
#ifdef __linux__
  const int names[] = {_SC_LEVEL1_DCACHE_LINESIZE, _SC_LEVEL2_CACHE_LINESIZE,
                       _SC_LEVEL3_CACHE_LINESIZE, _SC_LEVEL4_CACHE_LINESIZE};
  long linesize = 0; // 0 means "nothing detected yet"
  for (const int name : names) {
    const long var = sysconf(name);
    if (var <= 0) {
      continue; // level absent, unknown, or sysconf failed
    }
    linesize = (linesize > 0) ? std::min(linesize, var) : var;
  }
  return linesize > 0 ? static_cast<size_t>(linesize) : 16;
#else
#error "unsupported platform"
#endif
}
} // namespace perfect } // namespace perfect

View File

@@ -1,5 +1,7 @@
#pragma once #pragma once
#include <algorithm>
#include "detail/nvidia/nvidia-ml.hpp" #include "detail/nvidia/nvidia-ml.hpp"
namespace perfect { namespace perfect {
@@ -8,22 +10,28 @@ namespace perfect {
*/ */
Result set_max_gpu_clocks(unsigned int idx) { Result set_max_gpu_clocks(unsigned int idx) {
Result rt; Result ret;
std::vector<unsigned int> clksMhz; std::vector<unsigned int> clksMhz;
nvmlDevice_t device;
ret = get_device_memory_clocks(clksMhz, idx); ret = from_nvml(nvmlDeviceGetHandleByIndex(idx, &device));
if (ret != Result::SUCCESS) {
auto maxMemMhz = *std::max_element(memClksMhz.begin(), memClksMhz.end()); return ret;
ret = get_device_graphics_clocks(clksMhz, idx);
auto maxCoreMhz = *std::max_element(clksMhz.begin(), clksMhz.end());
auto ret = nvmlDeviceSetApplicationsClocks(device, maxMemMhz, maxCoreMhz);
if (ret == NVML_ERROR_NOT_SUPPORTED) {
return Result::NVML_NOT_SUPPORTED;
} else if (ret == NVML_ERROR_NO_PERMISSION) {
return Result::NVML_NO_PERMISSION;
} }
return Result::SUCCESS;
ret = detail::get_device_memory_clocks(clksMhz, idx);
if (ret != Result::SUCCESS) {
return ret;
}
auto maxMemMhz = *std::max_element(clksMhz.begin(), clksMhz.end());
ret = detail::get_device_graphics_clocks(clksMhz, idx, maxMemMhz);
if (ret != Result::SUCCESS) {
return ret;
}
auto maxCoreMhz = *std::max_element(clksMhz.begin(), clksMhz.end());
return from_nvml(
nvmlDeviceSetApplicationsClocks(device, maxMemMhz, maxCoreMhz));
} }
/*! Reset GPU clocks to default behavior /*! Reset GPU clocks to default behavior
@@ -36,13 +44,7 @@ Result reset_gpu_clocks(unsigned int idx) {
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
assert(false); assert(false);
} }
ret = nvmlDeviceResetApplicationsClocks(device); return from_nvml(nvmlDeviceResetApplicationsClocks(device));
if (ret == NVML_ERROR_NOT_SUPPORTED) {
return Result::NVML_NOT_SUPPORTED;
} else if (ret == NVML_ERROR_NO_PERMISSION) {
return Result::NVML_NO_PERMISSION;
}
return Result::SUCCESS;
} }
}; // namespace perfect }; // namespace perfect

View File

@@ -14,6 +14,10 @@ enum class Result {
UNKNOWN UNKNOWN
}; };
Result from_nvml(nvmlReturn_t nvml) { Result from_nvml(nvmlReturn_t nvml) {
switch (nvml) { switch (nvml) {
case NVML_SUCCESS: case NVML_SUCCESS:
@@ -51,4 +55,14 @@ const char *get_string(const Result &result) {
return ""; return "";
} }
/*! Terminate the program if result is not Result::SUCCESS.

On failure, prints "<file>@<line>: perfect Error: <description>" to stderr,
where the description comes from get_string(result), and then calls exit(-1).
Does nothing when result is Result::SUCCESS.

\param result the status to test
\param file   source file name to report
\param line   source line number to report
*/
inline void check(Result result, const char *file, const int line) {
  if (Result::SUCCESS == result) {
    return;
  }
  const char *const message = get_string(result);
  fprintf(stderr, "%s@%d: perfect Error: %s\n", file, line, message);
  exit(-1);
}
} // namespace perfect } // namespace perfect
/*! Evaluate stmt, an expression yielding a perfect::Result, and hand it to
perfect::check along with the call site's file and line.

check is fully qualified so the macro works without `using namespace perfect`.
The expansion is a single expression with no trailing semicolon: the call site
supplies its own, which keeps `if (c) PERFECT(x); else ...` well-formed.
*/
#define PERFECT(stmt) ::perfect::check((stmt), __FILE__, __LINE__)

View File

@@ -12,24 +12,14 @@ int main(void) {
perfect::init(); perfect::init();
ret = get_cpu_turbo_state(&state); PERFECT(get_cpu_turbo_state(&state));
if (ret != Result::SUCCESS) {
std::cerr << "ERROR: " << get_string(ret) << "\n";
exit(EXIT_FAILURE);
}
if (is_turbo_enabled(state)) { if (is_turbo_enabled(state)) {
std::cerr << "turbo already enabled\n"; std::cerr << "cpu turbo already enabled\n";
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} else { } else {
ret = enable_cpu_turbo(); PERFECT(enable_cpu_turbo());
if (ret != Result::SUCCESS) { std::cerr << "enabled cpu turbo\n";
std::cerr << "ERROR: " << get_string(ret) << "\n";
exit(EXIT_FAILURE);
} else {
std::cerr << "enabled turbo\n";
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
}
} }
} }