work on gpu clocks and gpu turbo example
This commit is contained in:
@@ -13,6 +13,10 @@ CPU/GPU performance control library for benchmarking
|
||||
- [x] Disable GPU turbo (nvidia)
|
||||
- [x] Flush addresses from cache (amd64, POWER)
|
||||
|
||||
## API
|
||||
|
||||
|
||||
|
||||
## Wish List
|
||||
|
||||
- [ ] Nvidia GPU power monitoring
|
||||
|
@@ -1,6 +1,9 @@
|
||||
#include "perfect/cpu_cache.hpp"
|
||||
|
||||
int main(void) {
|
||||
|
||||
using namespace perfect;
|
||||
|
||||
int *a = new int[1024];
|
||||
flush_all(a, 1024 * sizeof(int));
|
||||
|
||||
|
@@ -1,5 +1,18 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "perfect/gpu_clocks.hpp"
|
||||
#include "perfect/init.hpp"
|
||||
|
||||
int main(void) {
|
||||
|
||||
using namespace perfect;
|
||||
|
||||
init();
|
||||
|
||||
for (unsigned int gpu = 0; gpu < 1; ++gpu) {
|
||||
PERFECT(perfect::set_max_gpu_clocks(gpu));
|
||||
PERFECT(perfect::reset_gpu_clocks(gpu));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
@@ -1,5 +1,18 @@
|
||||
#include "perfect/gpu_turbo.hpp"
|
||||
#include "perfect/init.hpp"
|
||||
|
||||
#define OR_DIE(expr)
|
||||
|
||||
int main(void) {
|
||||
|
||||
using namespace perfect;
|
||||
GpuTurboState state;
|
||||
|
||||
init();
|
||||
|
||||
for (unsigned int gpu = 0; gpu < 1; ++gpu) {
|
||||
PERFECT(perfect::get_gpu_turbo_state(&state, gpu));
|
||||
PERFECT(perfect::disable_gpu_turbo(gpu));
|
||||
PERFECT(perfect::set_gpu_turbo_state(state, gpu));
|
||||
}
|
||||
}
|
@@ -11,116 +11,35 @@ Routines for controlling CPU caching
|
||||
#include <iostream>
|
||||
|
||||
#ifdef __linux__
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#Simple-Constraints
|
||||
|
||||
inline void flush_line(void *p) {
|
||||
#ifdef __powerpc__
|
||||
|
||||
/*
|
||||
PowerISA_V2.07B p. 773
|
||||
dcbf RA,RB,L
|
||||
|
||||
effective address is RA|0 + RB
|
||||
this mnemonic has L=0, which is through all cache levels
|
||||
write block to storage and mark as invalid in all processors
|
||||
*/
|
||||
|
||||
/*!
|
||||
|
||||
linux/arch/powerpc/include/asm/cache.h
|
||||
*/
|
||||
asm volatile("dcbf 0, %0"
|
||||
: // no outputs
|
||||
: "r"(p)
|
||||
: "memory");
|
||||
|
||||
#elif __amd64__
|
||||
|
||||
/*!
|
||||
|
||||
arch/x86/include/asm/special_insns.h
|
||||
|
||||
p139
|
||||
https://www.amd.com/system/files/TechDocs/24594.pdf
|
||||
|
||||
clflush mem8
|
||||
*/
|
||||
|
||||
asm volatile("clflush %0"
|
||||
: "+m"(p)
|
||||
: // no inputs
|
||||
: // no clobbers
|
||||
);
|
||||
#include "detail/os/linux.hpp"
|
||||
#else
|
||||
#error "unsupported platform"
|
||||
(void)p;
|
||||
#error "unsupported OS"
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void barrier_all() {
|
||||
|
||||
#ifdef __powerpc__
|
||||
|
||||
// sync is a mnemonic for sync 0, heavyweight sync
|
||||
asm volatile("sync"
|
||||
: // no outputs
|
||||
: // no inputs
|
||||
: "memory");
|
||||
|
||||
#include "detail/cache/power.hpp"
|
||||
#elif __amd64__
|
||||
|
||||
asm volatile("mfence"
|
||||
: // no outputs
|
||||
: // no inputs
|
||||
: "memory");
|
||||
|
||||
#include "detail/cache/amd64.hpp"
|
||||
#else
|
||||
#error "unsupported platform"
|
||||
#error "unsupported CPU arch"
|
||||
#endif
|
||||
}
|
||||
|
||||
/*! Return the smallest cache line size (in bytes) detected on the platform.

    The L1 data, L2, L3 and L4 line sizes are queried through sysconf().
    Levels for which sysconf() reports an error (-1) or an unknown size (0)
    are ignored, so one undetectable level cannot hide a size reported by
    another level.

    \return the smallest positive line size reported, or 16 if no level
            reported one.
*/
inline size_t cache_linesize() {
#ifdef __linux__
  const int names[] = {_SC_LEVEL1_DCACHE_LINESIZE, _SC_LEVEL2_CACHE_LINESIZE,
                       _SC_LEVEL3_CACHE_LINESIZE, _SC_LEVEL4_CACHE_LINESIZE};

  long smallest = 0; // 0 means "nothing detected yet"
  for (const int name : names) {
    const long reported = sysconf(name);
    if (reported <= 0) {
      continue; // error or unknown for this level
    }
    if (0 == smallest || reported < smallest) {
      smallest = reported;
    }
  }

  return smallest > 0 ? static_cast<size_t>(smallest) : 16;
#else
#error "unsupported platform"
#endif
}
|
||||
namespace perfect {
|
||||
|
||||
inline void flush_all(void *p, const size_t n) {
|
||||
|
||||
size_t lineSize = cache_linesize();
|
||||
|
||||
// cache flush may not be ordered wrt other kinds of accesses
|
||||
barrier_all();
|
||||
detail::barrier_all();
|
||||
|
||||
for (size_t i = 0; i < n; i += lineSize) {
|
||||
char *c = static_cast<char *>(p);
|
||||
flush_line(&c[i]);
|
||||
detail::flush_line(&c[i]);
|
||||
}
|
||||
|
||||
// make flushing visible to other accesses
|
||||
barrier_all();
|
||||
detail::barrier_all();
|
||||
}
|
||||
|
||||
}
|
35
include/perfect/detail/cache/amd64.hpp
vendored
Normal file
35
include/perfect/detail/cache/amd64.hpp
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
#pragma once
|
||||
|
||||
namespace perfect {
|
||||
namespace detail {
|
||||
|
||||
|
||||
|
||||
/*! Flush the cache line containing the byte that p points to.

    \param p address inside the cache line to flush; must point to
             addressable memory
*/
inline void flush_line(void *p) {
  /*!
  arch/x86/include/asm/special_insns.h

  p139
  https://www.amd.com/system/files/TechDocs/24594.pdf

  clflush mem8
  */

  // The memory operand must be the byte p points at. Passing p itself would
  // make clflush act on the local pointer variable instead of the target.
  asm volatile("clflush %0"
               : "+m"(*static_cast<volatile char *>(p))
               : // no inputs
               : // no clobbers
  );
}
|
||||
|
||||
/*! Issue an x86 mfence instruction.

    The "memory" clobber additionally tells the compiler not to move memory
    accesses across this statement.
*/
inline void barrier_all() {
  // no outputs, no inputs; only the memory clobber
  __asm__ __volatile__("mfence" : : : "memory");
}
|
||||
|
||||
}
|
||||
}
|
35
include/perfect/detail/cache/power.hpp
vendored
Normal file
35
include/perfect/detail/cache/power.hpp
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
#pragma once
|
||||
|
||||
namespace perfect {
namespace detail {

/*! Flush the cache block containing address p (POWER).

    \param p address inside the cache block to flush
*/
inline void flush_line(void *p) {

  /*
  PowerISA_V2.07B p. 773
  dcbf RA,RB,L

  effective address is RA|0 + RB
  this mnemonic has L=0, which is through all cache levels
  write block to storage and mark as invalid in all processors
  */

  /*!
  linux/arch/powerpc/include/asm/cache.h
  */
  asm volatile("dcbf 0, %0"
               : // no outputs
               : "r"(p)
               : "memory");
}

/*! Issue a heavyweight sync (POWER). */
inline void barrier_all() {

  // sync is a mnemonic for sync 0, heavyweight sync
  asm volatile("sync"
               : // no outputs
               : // no inputs
               : "memory");
}

} // namespace detail
} // namespace perfect
|
@@ -18,19 +18,23 @@ inline void checkNvml(nvmlReturn_t result, const char *file, const int line) {
|
||||
|
||||
namespace perfect {
|
||||
namespace detail {
|
||||
std::vector<unsigned int> get_device_memory_clocks(unsigned int index) {
|
||||
std::vector<unsigned int> result;
|
||||
nvmlDevice_t device;
|
||||
nvmlDeviceGetHandleByIndex(index, &device);
|
||||
unsigned int resultCount = 0;
|
||||
|
||||
auto ret = nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, nullptr);
|
||||
if (ret != NVML_ERROR_INSUFFICIENT_SIZE) {
|
||||
NVML(ret);
|
||||
Result get_device_memory_clocks(std::vector<unsigned int> &memoryClocksMhz, unsigned int index) {
|
||||
nvmlDevice_t device;
|
||||
nvmlReturn_t ret;
|
||||
ret = nvmlDeviceGetHandleByIndex(index, &device);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
return from_nvml(ret);
|
||||
}
|
||||
result.resize(resultCount);
|
||||
NVML(nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, result.data()));
|
||||
return result;
|
||||
|
||||
unsigned int resultCount = 0;
|
||||
ret = nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, nullptr);
|
||||
if (ret != NVML_ERROR_INSUFFICIENT_SIZE) {
|
||||
return from_nvml(ret);
|
||||
}
|
||||
memoryClocksMhz.resize(resultCount);
|
||||
NVML(nvmlDeviceGetSupportedMemoryClocks(device, &resultCount, memoryClocksMhz.data()));
|
||||
return Result::SUCCESS;
|
||||
}
|
||||
|
||||
Result get_device_graphics_clocks(std::vector<unsigned int> &graphicsClocksMhz,
|
||||
|
@@ -51,4 +51,30 @@ Result set_governor(const int cpu, const std::string &governor) {
|
||||
return Result::SUCCESS;
|
||||
}
|
||||
|
||||
/*! Return the smallest cache line size (in bytes) detected on the platform.

    The L1 data, L2, L3 and L4 line sizes are queried through sysconf().
    Levels for which sysconf() reports an error (-1) or an unknown size (0)
    are ignored, so one undetectable level cannot hide a size reported by
    another level.

    \return the smallest positive line size reported, or 16 if no level
            reported one.
*/
inline size_t cache_linesize() {
#ifdef __linux__
  const int names[] = {_SC_LEVEL1_DCACHE_LINESIZE, _SC_LEVEL2_CACHE_LINESIZE,
                       _SC_LEVEL3_CACHE_LINESIZE, _SC_LEVEL4_CACHE_LINESIZE};

  long smallest = 0; // 0 means "nothing detected yet"
  for (const int name : names) {
    const long reported = sysconf(name);
    if (reported <= 0) {
      continue; // error or unknown for this level
    }
    if (0 == smallest || reported < smallest) {
      smallest = reported;
    }
  }

  return smallest > 0 ? static_cast<size_t>(smallest) : 16;
#else
#error "unsupported platform"
#endif
}
|
||||
|
||||
} // namespace perfect
|
@@ -1,5 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "detail/nvidia/nvidia-ml.hpp"
|
||||
|
||||
namespace perfect {
|
||||
@@ -8,22 +10,28 @@ namespace perfect {
|
||||
*/
|
||||
inline Result set_max_gpu_clocks(unsigned int idx) {
  // Sets the application clocks of device idx to the highest supported memory
  // clock and the highest graphics clock available at that memory clock.
  // Returns the first failure encountered; Result::NVML_NOT_SUPPORTED is
  // returned when a clock query succeeds but lists no clocks.

  nvmlDevice_t device;
  Result ret = from_nvml(nvmlDeviceGetHandleByIndex(idx, &device));
  if (ret != Result::SUCCESS) {
    return ret;
  }

  std::vector<unsigned int> clksMhz;

  ret = detail::get_device_memory_clocks(clksMhz, idx);
  if (ret != Result::SUCCESS) {
    return ret;
  }
  if (clksMhz.empty()) {
    // max_element would return end(); dereferencing it is undefined
    return Result::NVML_NOT_SUPPORTED;
  }
  const unsigned int maxMemMhz =
      *std::max_element(clksMhz.begin(), clksMhz.end());

  // graphics clocks are queried for the chosen memory clock
  ret = detail::get_device_graphics_clocks(clksMhz, idx, maxMemMhz);
  if (ret != Result::SUCCESS) {
    return ret;
  }
  if (clksMhz.empty()) {
    return Result::NVML_NOT_SUPPORTED;
  }
  const unsigned int maxCoreMhz =
      *std::max_element(clksMhz.begin(), clksMhz.end());

  return from_nvml(
      nvmlDeviceSetApplicationsClocks(device, maxMemMhz, maxCoreMhz));
}
|
||||
|
||||
/*! Reset GPU clocks to default behavior
|
||||
@@ -36,13 +44,7 @@ Result reset_gpu_clocks(unsigned int idx) {
|
||||
if (ret != NVML_SUCCESS) {
|
||||
assert(false);
|
||||
}
|
||||
ret = nvmlDeviceResetApplicationsClocks(device);
|
||||
if (ret == NVML_ERROR_NOT_SUPPORTED) {
|
||||
return Result::NVML_NOT_SUPPORTED;
|
||||
} else if (ret == NVML_ERROR_NO_PERMISSION) {
|
||||
return Result::NVML_NO_PERMISSION;
|
||||
}
|
||||
return Result::SUCCESS;
|
||||
return from_nvml(nvmlDeviceResetApplicationsClocks(device));
|
||||
}
|
||||
|
||||
}; // namespace perfect
|
@@ -14,6 +14,10 @@ enum class Result {
|
||||
UNKNOWN
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Result from_nvml(nvmlReturn_t nvml) {
|
||||
switch (nvml) {
|
||||
case NVML_SUCCESS:
|
||||
@@ -51,4 +55,14 @@ const char *get_string(const Result &result) {
|
||||
return "";
|
||||
}
|
||||
|
||||
/*! Terminate the program unless result is Result::SUCCESS.

    On failure, prints the call site and the text from get_string(result) to
    stderr, then exits with status -1.

    \param result value to test
    \param file   source file name reported in the message
    \param line   source line reported in the message
*/
inline void check(Result result, const char *file, const int line) {
  if (Result::SUCCESS == result) {
    return;
  }

  const char *const message = get_string(result);
  fprintf(stderr, "%s@%d: perfect Error: %s\n", file, line, message);
  exit(-1);
}
|
||||
|
||||
} // namespace perfect
|
||||
|
||||
// Evaluate stmt (an expression yielding a perfect::Result) and hand it to
// perfect::check together with the call site. The call is fully qualified so
// the macro works without a using-directive, and the argument is
// parenthesized so any expression can be passed. The trailing semicolon is
// kept so existing uses written with or without their own ';' still compile.
#define PERFECT(stmt) ::perfect::check((stmt), __FILE__, __LINE__);
|
||||
|
@@ -12,24 +12,14 @@ int main(void) {
|
||||
|
||||
perfect::init();
|
||||
|
||||
ret = get_cpu_turbo_state(&state);
|
||||
|
||||
if (ret != Result::SUCCESS) {
|
||||
std::cerr << "ERROR: " << get_string(ret) << "\n";
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
PERFECT(get_cpu_turbo_state(&state));
|
||||
|
||||
if (is_turbo_enabled(state)) {
|
||||
std::cerr << "turbo already enabled\n";
|
||||
std::cerr << "cpu turbo already enabled\n";
|
||||
exit(EXIT_SUCCESS);
|
||||
} else {
|
||||
ret = enable_cpu_turbo();
|
||||
if (ret != Result::SUCCESS) {
|
||||
std::cerr << "ERROR: " << get_string(ret) << "\n";
|
||||
exit(EXIT_FAILURE);
|
||||
} else {
|
||||
std::cerr << "enabled turbo\n";
|
||||
PERFECT(enable_cpu_turbo());
|
||||
std::cerr << "enabled cpu turbo\n";
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user