initial commit
This commit is contained in:
45
CMakeLists.txt
Normal file
45
CMakeLists.txt
Normal file
@@ -0,0 +1,45 @@
|
||||
# 3.8+ for project(LANGUAGES CUDA)
# 3.9+ for OpenMP::OpenMP_CXX
# 3.10+ findopenmp gained support for language-specific components
# 3.11+ for CMake not to add -fopenmp to the nvcc flags
cmake_minimum_required(VERSION 3.11 FATAL_ERROR)

project(perfect LANGUAGES CUDA CXX VERSION 0.1.0)

# https://blog.kitware.com/cmake-and-the-default-build-type/
# Set a default build type if none was specified:
# Debug when building from a git checkout, Release otherwise.
set(default_build_type "Release")
if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
  set(default_build_type "Debug")
endif()
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
      STRING "Choose the type of build." FORCE)
  # Set the possible values of build type for cmake-gui
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
               "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

# Report the build type only after the default has been applied, so the
# message is never empty for single-config generators.
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")

# CMAKE_CUDA_FLAGS is a single space-separated string. Append to it with
# string(APPEND): set(VAR ${VAR} flag) would produce a ';'-separated list and
# corrupt the compiler command line whenever the variable is non-empty.
if(CMAKE_BUILD_TYPE MATCHES Debug)
  message(STATUS "Setting verbose build during Debug")
  set(CMAKE_VERBOSE_MAKEFILE ON)
  string(APPEND CMAKE_CUDA_FLAGS " -G")
elseif(CMAKE_BUILD_TYPE MATCHES Release)
  add_definitions(-DNDEBUG)
  string(APPEND CMAKE_CUDA_FLAGS " -lineinfo")
endif()

set(CMAKE_CUDA_STANDARD 11)

add_subdirectory(include/perfect)

# Header-only library: consumers only need the include path.
add_library(perfect INTERFACE)
target_include_directories(perfect INTERFACE include/)

add_subdirectory(examples)
|
17
README.md
17
README.md
@@ -1,2 +1,19 @@
|
||||
# perfect
|
||||
|
||||
CPU/GPU performance control library for benchmarking
|
||||
* x86
|
||||
* POWER
|
||||
* Nvidia
|
||||
|
||||
## Features
|
||||
|
||||
- [x] Disable CPU turbo (linux)
|
||||
- [x] Set OS CPU performance mode to maximum (linux)
|
||||
- [x] Set GPU clocks (nvidia)
|
||||
- [x] Disable GPU turbo (nvidia)
|
||||
- [x] Flush addresses from cache (amd64, POWER)
|
||||
|
||||
## Wish List
|
||||
|
||||
- [ ] Nvidia GPU power monitoring
|
||||
- [ ] Nvidia GPU utilization monitoring
|
50
examples/CMakeLists.txt
Normal file
50
examples/CMakeLists.txt
Normal file
@@ -0,0 +1,50 @@
|
||||
# Host-compiler diagnostics shared by the C++ and CUDA examples.
# Kept as a list and appended flag by flag so the resulting flag strings do
# not depend on backslash line continuations or on the indentation of the
# continued lines.
set(perfect_example_host_flags
  -Wall
  -Wextra
  -Wcast-qual
  -Wcast-align
  -Wstrict-aliasing
  -Wpointer-arith
  -Winit-self
  -Wshadow
  -Wswitch-enum
  -Wfloat-equal
  -Wvla
  -fmax-errors=1
  -Wfatal-errors
)

# CUDA: forward each flag to the host compiler through nvcc.
# removed -Wredundant-decls for cuda 10.1
# removed -Wundef for cuda 10.0
foreach(flag IN LISTS perfect_example_host_flags)
  string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler=${flag}")
endforeach()

# C++: the shared set plus the two warnings that are not passed to CUDA.
foreach(flag IN ITEMS -Wredundant-decls -Wundef ${perfect_example_host_flags})
  string(APPEND CMAKE_CXX_FLAGS " ${flag}")
endforeach()

add_executable(cpu-cache cpu_cache.cpp)
target_link_libraries(cpu-cache PRIVATE perfect)

add_executable(cpu-turbo cpu_turbo.cpp)
target_link_libraries(cpu-turbo PRIVATE perfect)

add_executable(os-perf os_perf.cpp)
target_link_libraries(os-perf PRIVATE perfect)

add_executable(gpu-clocks gpu_clocks.cu)
target_link_libraries(gpu-clocks PRIVATE perfect)

add_executable(gpu-turbo gpu_turbo.cu)
target_link_libraries(gpu-turbo PRIVATE perfect)
|
11
examples/cpu_cache.cpp
Normal file
11
examples/cpu_cache.cpp
Normal file
@@ -0,0 +1,11 @@
|
||||
#include "perfect/cpu_cache.hpp"
|
||||
|
||||
int main(void) {
  // Example: evict a freshly allocated buffer from the CPU caches.
  const int kCount = 1024;
  int *a = new int[kCount];

  flush_all(a, kCount * sizeof(int));

  // `a` has now been flushed from cache into main memory, and flush_all
  // issued a full barrier before returning; benchmark code would go here.

  delete[] a;
  return 0;
}
|
14
examples/cpu_turbo.cpp
Normal file
14
examples/cpu_turbo.cpp
Normal file
@@ -0,0 +1,14 @@
|
||||
#include "perfect/cpu_turbo.hpp"
|
||||
|
||||
int main(void) {
|
||||
|
||||
perfect::CpuTurboState state;
|
||||
perfect::get_cpu_turbo_state(&state);
|
||||
|
||||
perfect::disable_cpu_turbo();
|
||||
|
||||
// do things with CPU turbo disabled
|
||||
|
||||
perfect::set_cpu_turbo_state(state);
|
||||
|
||||
}
|
5
examples/gpu_clocks.cu
Normal file
5
examples/gpu_clocks.cu
Normal file
@@ -0,0 +1,5 @@
|
||||
#include "perfect/gpu_clocks.hpp"
|
||||
|
||||
// Placeholder example: only checks that perfect/gpu_clocks.hpp compiles.
int main(void) { return 0; }
|
5
examples/gpu_turbo.cu
Normal file
5
examples/gpu_turbo.cu
Normal file
@@ -0,0 +1,5 @@
|
||||
#include "perfect/gpu_turbo.hpp"
|
||||
|
||||
// Placeholder example: only checks that perfect/gpu_turbo.hpp compiles.
int main(void) { return 0; }
|
26
examples/os_perf.cpp
Normal file
26
examples/os_perf.cpp
Normal file
@@ -0,0 +1,26 @@
|
||||
#include "perfect/os_perf.hpp"
|
||||
|
||||
#include <map>
|
||||
|
||||
int main(void) {
|
||||
|
||||
|
||||
std::map<int, perfect::OsPerfState> states;
|
||||
|
||||
for (auto cpu : perfect::cpus()) {
|
||||
perfect::OsPerfState state;
|
||||
perfect::get_os_perf_state(&state, cpu);
|
||||
states[cpu] = state;
|
||||
perfect::os_perf_state_maximum(cpu);
|
||||
}
|
||||
|
||||
// do things with all CPUs set to the maximum performancem mode by the OS
|
||||
|
||||
for (auto kv : states) {
|
||||
int cpu = kv.first;
|
||||
perfect::OsPerfState state = kv.second;
|
||||
perfect::set_os_perf_state(cpu, state);
|
||||
}
|
||||
|
||||
|
||||
}
|
4
include/perfect/CMakeLists.txt
Normal file
4
include/perfect/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Publish the library's headers to the parent directory as PERFECT_HEADERS.
#
# The list is built locally and exported once. set(... PARENT_SCOPE) does not
# modify the variable in the current scope, so chaining several
# `set(PERFECT_HEADERS ${PERFECT_HEADERS} x PARENT_SCOPE)` calls would leave
# only the last header in the parent's list.
list(APPEND PERFECT_HEADERS
  cpu_cache.hpp
  cpu_turbo.hpp
  os_perf.hpp
  result.hpp
)
set(PERFECT_HEADERS "${PERFECT_HEADERS}" PARENT_SCOPE)
|
126
include/perfect/cpu_cache.hpp
Normal file
126
include/perfect/cpu_cache.hpp
Normal file
@@ -0,0 +1,126 @@
|
||||
#pragma once
|
||||
|
||||
/*!
|
||||
Routines for controlling CPU caching
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
|
||||
#ifdef __linux__
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#Simple-Constraints
|
||||
|
||||
/*! Flush the cache line containing the byte at address `p`.

    \param p address inside the cache line to flush

    No ordering is established here; callers that need the flush ordered
    against other accesses must issue their own barrier.
*/
inline void flush_line(void *p) {
#if defined(__powerpc__)

  /*
  PowerISA_V2.07B p. 773
  dcbf RA,RB,L

  effective address is RA|0 + RB
  this mnemonic has L=0, which is through all cache levels
  write block to storage and mark as invalid in all processors

  see also linux/arch/powerpc/include/asm/cache.h
  */
  asm volatile("dcbf 0, %0"
               : // no outputs
               : "r"(p)
               : "memory");

#elif defined(__amd64__)

  /*
  arch/x86/include/asm/special_insns.h

  p139
  https://www.amd.com/system/files/TechDocs/24594.pdf

  clflush mem8
  */
  // The memory operand must be the byte that `p` points to. Passing `p`
  // itself as the "m" operand would flush the line holding the pointer
  // variable instead of the caller's data.
  asm volatile("clflush %0"
               : "+m"(*static_cast<volatile char *>(p))
               : // no inputs
               : // no clobbers
  );

#else
#error "unsupported platform"
  (void)p;
#endif
}
|
||||
|
||||
/*! Issue a full hardware memory barrier.

    The asm statement also carries a "memory" clobber, so the compiler does
    not move memory accesses across it.
*/
inline void barrier_all() {
#if defined(__powerpc__)
  // "sync" is the mnemonic for sync 0, the heavyweight sync
  __asm__ __volatile__("sync" : : : "memory");
#else
#if __amd64__
  __asm__ __volatile__("mfence" : : : "memory");
#else
#error "unsupported platform"
#endif
#endif
}
|
||||
|
||||
/*! Return the smallest cache line size detected on the platform.

    Queries sysconf for the L1 data, L2, L3 and L4 cache line sizes and keeps
    the smallest value that is strictly positive. Levels for which sysconf
    reports 0 or a negative value are ignored rather than forcing the
    fallback.

    \return the smallest positive line size reported, or 16 if no level
            reported one.
*/
inline size_t cache_linesize() {
#ifdef __linux__
  const int levels[] = {_SC_LEVEL1_DCACHE_LINESIZE, _SC_LEVEL2_CACHE_LINESIZE,
                        _SC_LEVEL3_CACHE_LINESIZE, _SC_LEVEL4_CACHE_LINESIZE};

  long smallest = 0; // 0 means "nothing detected yet"
  for (int level : levels) {
    const long reported = sysconf(level);
    if (reported <= 0) {
      continue; // level absent, unknown, or sysconf failed
    }
    if (0 == smallest || reported < smallest) {
      smallest = reported;
    }
  }

  return smallest > 0 ? static_cast<size_t>(smallest) : 16;
#else
#error "unsupported platform"
#endif
}
|
||||
|
||||
inline void flush_all(void *p, const size_t n) {
|
||||
|
||||
size_t lineSize = cache_linesize();
|
||||
|
||||
// cache flush may not be ordered wrt other kinds of accesses
|
||||
barrier_all();
|
||||
|
||||
for (size_t i = 0; i < n; i += lineSize) {
|
||||
char *c = static_cast<char *>(p);
|
||||
flush_line(&c[i]);
|
||||
}
|
||||
|
||||
// make flushing visible to other accesses
|
||||
barrier_all();
|
||||
}
|
||||
|
41
include/perfect/cpu_turbo.hpp
Normal file
41
include/perfect/cpu_turbo.hpp
Normal file
@@ -0,0 +1,41 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef __linux__ // linux
|
||||
#include "detail/os/linux.hpp"
|
||||
|
||||
#ifdef __amd64__
|
||||
#include "detail/turbo/linux_amd64.hpp"
|
||||
#elif __powerpc64__
|
||||
#include "detail/turbo/linux_power.hpp"
|
||||
#else
|
||||
#error "unsupported CPU arch"
|
||||
#endif
|
||||
|
||||
#else // not linux
|
||||
#error "unsupported OS"
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#include "result.hpp"
|
||||
|
||||
namespace perfect {
|
||||
|
||||
struct CpuTurboState {
|
||||
bool enabled;
|
||||
};
|
||||
|
||||
|
||||
Result get_cpu_turbo_state(CpuTurboState *state) {
|
||||
state->enabled = is_turbo_enabled();
|
||||
}
|
||||
|
||||
Result set_cpu_turbo_state(CpuTurboState state) {
|
||||
if (state.enabled) {
|
||||
enable_cpu_turbo();
|
||||
} else {
|
||||
disable_cpu_turbo();
|
||||
}
|
||||
}
|
||||
|
||||
};
|
51
include/perfect/detail/os/linux.hpp
Normal file
51
include/perfect/detail/os/linux.hpp
Normal file
@@ -0,0 +1,51 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include <sched.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "perfect/result.hpp"
|
||||
|
||||
namespace perfect {
|
||||
|
||||
std::vector<int> cpus() {
|
||||
std::vector<int> result;
|
||||
cpu_set_t mask;
|
||||
if (sched_getaffinity(0 /*caller*/, sizeof(cpu_set_t), &mask)) {
|
||||
assert(0 && "failed sched_getaffinity");
|
||||
}
|
||||
for (int i = 0; i < CPU_SETSIZE; ++i) {
|
||||
if (CPU_ISSET(i, &mask)) {
|
||||
result.push_back(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Result get_governor(std::string &result, const int cpu) {
|
||||
std::string path("/sys/devices/system/cpu/cpu");
|
||||
path += std::to_string(cpu);
|
||||
path += "/cpufreq/scaling_governor";
|
||||
std::ifstream ifs(path, std::ifstream::in);
|
||||
std::getline(ifs, result);
|
||||
return Result::SUCCESS;
|
||||
}
|
||||
|
||||
Result set_governor(const int cpu, const std::string &governor) {
|
||||
std::string path("/sys/devices/system/cpu/cpu");
|
||||
path += std::to_string(cpu);
|
||||
path += "/cpufreq/scaling_governor";
|
||||
std::ofstream ofs(path, std::ofstream::out);
|
||||
ofs << governor;
|
||||
ofs.close();
|
||||
if (ofs.fail()) {
|
||||
return Result::NO_PERMISSION;
|
||||
}
|
||||
return Result::SUCCESS;
|
||||
}
|
||||
|
||||
}
|
50
include/perfect/detail/turbo/linux_amd64.hpp
Normal file
50
include/perfect/detail/turbo/linux_amd64.hpp
Normal file
@@ -0,0 +1,50 @@
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <fstream>
|
||||
|
||||
#include "perfect/result.hpp"
|
||||
|
||||
namespace perfect {
|
||||
|
||||
bool has_intel_pstate_no_turbo() {
|
||||
return bool(std::ifstream("/sys/devices/system/cpu/intel_pstate/no_turbo"));
|
||||
}
|
||||
|
||||
int write_intel_pstate_no_turbo(const std::string &s) {
|
||||
assert(has_intel_pstate_no_turbo());
|
||||
std::string path("/sys/devices/system/cpu/intel_pstate/no_turbo");
|
||||
// SPDLOG_LOGGER_DEBUG(logger::console(), "writing {} to {}", s, path);
|
||||
std::ofstream ofs(path, std::ofstream::out);
|
||||
ofs << s;
|
||||
ofs.close();
|
||||
if (ofs.fail()) {
|
||||
// SPDLOG_LOGGER_DEBUG(logger::console(), "error writing {} to {}", s, path);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string read_intel_pstate_no_turbo() {
|
||||
assert(has_intel_pstate_no_turbo());
|
||||
std::string path("/sys/devices/system/cpu/intel_pstate/no_turbo");
|
||||
// SPDLOG_LOGGER_TRACE(logger::console(), "reading {}", path);
|
||||
std::ifstream ifs(path, std::ifstream::in);
|
||||
std::string result;
|
||||
std::getline(ifs, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
bool is_turbo_enabled() {
|
||||
return "0" == read_intel_pstate_no_turbo();
|
||||
}
|
||||
|
||||
Result disable_cpu_turbo() {
|
||||
write_intel_pstate_no_turbo("1");
|
||||
}
|
||||
Result enable_cpu_turbo() {
|
||||
write_intel_pstate_no_turbo("1");
|
||||
}
|
||||
|
||||
|
||||
}
|
47
include/perfect/detail/turbo/linux_power.hpp
Normal file
47
include/perfect/detail/turbo/linux_power.hpp
Normal file
@@ -0,0 +1,47 @@
|
||||
#pragma once
|
||||
|
||||
#include "perfect/result.hpp"
|
||||
|
||||
namespace perfect {
|
||||
|
||||
bool has_acpi_cpufreq_boost() {
|
||||
return bool(std::ifstream("/sys/devices/system/cpu/cpufreq/boost"));
|
||||
}
|
||||
|
||||
int write_acpi_cpufreq_boost(const std::string &s) {
|
||||
assert(has_acpi_cpufreq_boost());
|
||||
std::string path("/sys/devices/system/cpu/cpufreq/boost");
|
||||
SPDLOG_LOGGER_TRACE(logger::console(), "writing to {}", path);
|
||||
std::ofstream ofs(path, std::ofstream::out);
|
||||
ofs << s;
|
||||
ofs.close();
|
||||
if (ofs.fail()) {
|
||||
SPDLOG_LOGGER_TRACE(logger::console(), "error writing to {}", path);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string read_acpi_cpufeq_boost() {
|
||||
assert(has_acpi_cpufreq_boost());
|
||||
std::string path("/sys/devices/system/cpu/cpufreq/boost");
|
||||
SPDLOG_LOGGER_TRACE(logger::console(), "reading {}", path);
|
||||
std::ifstream ifs(path, std::ifstream::in);
|
||||
std::string result;
|
||||
std::getline(ifs, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
bool is_turbo_enabled() {
|
||||
return "1" == read_acpi_cpufeq_boost();
|
||||
}
|
||||
|
||||
Result disable_cpu_turbo() {
|
||||
write_acpi_cpufeq_boost("0");
|
||||
}
|
||||
|
||||
Result enable_cpu_turbo() {
|
||||
write_acpi_cpufeq_boost("1");
|
||||
}
|
||||
|
||||
}
|
1
include/perfect/gpu_clocks.hpp
Normal file
1
include/perfect/gpu_clocks.hpp
Normal file
@@ -0,0 +1 @@
|
||||
#pragma once
|
1
include/perfect/gpu_turbo.hpp
Normal file
1
include/perfect/gpu_turbo.hpp
Normal file
@@ -0,0 +1 @@
|
||||
#pragma once
|
55
include/perfect/os_perf.hpp
Normal file
55
include/perfect/os_perf.hpp
Normal file
@@ -0,0 +1,55 @@
|
||||
#pragma once
|
||||
|
||||
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cassert>
|
||||
|
||||
|
||||
|
||||
#ifdef __linux__
|
||||
#include "detail/os/linux.hpp"
|
||||
#else
|
||||
#error "unsupported platform"
|
||||
#endif
|
||||
|
||||
#include "result.hpp"
|
||||
|
||||
namespace perfect {
|
||||
|
||||
struct OsPerfState {
|
||||
#ifdef __linux__
|
||||
std::string governor;
|
||||
#else
|
||||
#error "unsupported platform"
|
||||
#endif
|
||||
};
|
||||
|
||||
Result get_os_perf_state(OsPerfState *state, const int cpu) {
|
||||
assert(state);
|
||||
#ifdef __linux__
|
||||
return get_governor(state->governor, cpu);
|
||||
#else
|
||||
#error "unsupported platform"
|
||||
#endif
|
||||
}
|
||||
|
||||
Result os_perf_state_maximum(const int cpu) {
|
||||
#ifdef __linux__
|
||||
return set_governor(cpu, "performance");
|
||||
#else
|
||||
#error "unsupported platform"
|
||||
#endif
|
||||
}
|
||||
|
||||
Result set_os_perf_state(const int cpu, OsPerfState state) {
|
||||
#ifdef __linux__
|
||||
return set_governor(cpu, state.governor);
|
||||
#else
|
||||
#error "unsupported platform"
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
};
|
23
include/perfect/result.hpp
Normal file
23
include/perfect/result.hpp
Normal file
@@ -0,0 +1,23 @@
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
|
||||
namespace perfect {

/*! Status code returned by the library's functions. */
enum class Result {
  SUCCESS,
  NVIDIA_ML,
  NO_PERMISSION,
  UNKNOWN
};

/*! Return a static, human-readable description of `result`.

    Defined `inline` because this header is included from several other
    headers. A value outside the enumeration asserts in debug builds and
    yields a generic string otherwise, so control never falls off the end of
    the function.
*/
inline const char *get_string(const Result &result) {
  switch (result) {
  case Result::SUCCESS:
    return "success";
  case Result::NO_PERMISSION:
    return "no permission";
  case Result::UNKNOWN:
    return "unknown error";
  case Result::NVIDIA_ML:
    return "nvidia-ml error";
  }
  assert(0 && "unexpected perfect::Result");
  return "unexpected perfect::Result";
}

} // namespace perfect
|
Reference in New Issue
Block a user