diff --git a/README.md b/README.md
index cfcf41c..21f04fe 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ CPU/GPU performance control library for benchmarking
 
 ## Features
 
+- [x] GPU power/utilization/temperature monitoring (nvidia)
 - [x] Disable CPU turbo (linux)
 - [x] Set OS CPU performance mode to maximum (linux)
 - [x] Set GPU clocks (nvidia)
@@ -67,6 +68,22 @@ perfect::CpuTurboState state;
 PERFECT(perfect::get_cpu_turbo_state(&state));
 ```
 
+## Monitoring
+
+`perfect` can monitor and record GPU activity.
+
+See [examples/gpu_monitor.cu](examples/gpu_monitor.cu)
+
+```c++
+#include "perfect/gpu_monitor.hpp"
+```
+
+* `Monitor(std::ostream *stream)`: create a monitor that will write to `stream`.
+* `void Monitor::start()`: start the monitor
+* `void Monitor::stop()`: terminate the monitor
+* `void Monitor::pause()`: pause the monitor thread
+* `void Monitor::resume()`: resume the monitor thread
+
 ### CPU Turbo
 
 `perfect` can enable and disable CPU boost through the Intel p-state mechanism or the ACPI cpufreq mechanism.
@@ -121,7 +138,7 @@ See [examples/gpu_turbo.cu]
 See [examples/gpu_clocks.cu]
 
 ```c++
-#include "perfect/gpu_clocks.hpp`
+#include "perfect/gpu_clocks.hpp"
 ```
 
 * `Result set_max_gpu_clocks(unsigned int idx)`: Set GPU `idx` clocks to their maximum reported values.
@@ -134,13 +151,23 @@ See [examples/gpu_clocks.cu]
 See [examples/cpu_cache.cpp].
 
 ```c++
-#include "perfect/cpu_cache.hpp`
+#include "perfect/cpu_cache.hpp"
 ```
 
 * `void flush_all(void *p, const size_t n)`: Flush all cache lines starting at `p` for `n` bytes.
 
+## Changelog
+
+* v0.2.0
+    * add GPU monitoring
+* v0.1.0
+    * cache control
+    * Intel P-State control
+    * linux governor control
+    * POWER cpufreq control
+    * Nvidia GPU boost control
+    * Nvidia GPU clock control
+
 ## Wish List
 
-
-- [ ] Nvidia GPU power monitoring
-- [ ] Nivida GPU utilization monitoring
\ No newline at end of file
+- [ ] nothing right now
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 4f761cd..1730116 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -51,4 +51,9 @@ endif()
 if(CMAKE_CUDA_COMPILER)
     add_executable(gpu-turbo gpu_turbo.cu)
     target_link_libraries(gpu-turbo perfect)
+endif()
+
+if(CMAKE_CUDA_COMPILER)
+    add_executable(gpu-monitor gpu_monitor.cu) # example for perfect/gpu_monitor.hpp
+    target_link_libraries(gpu-monitor perfect)
 endif()
\ No newline at end of file
diff --git a/examples/gpu_monitor.cu b/examples/gpu_monitor.cu
new file mode 100644
index 0000000..2e67331
--- /dev/null
+++ b/examples/gpu_monitor.cu
@@ -0,0 +1,25 @@
+#include <chrono>
+#include <iostream>
+#include <thread>
+
+#include "perfect/gpu_monitor.hpp"
+
+// Sample the GPUs in the background and print rows to stderr until killed.
+int main(void) {
+  perfect::init();
+
+  // rows go to stderr
+  perfect::Monitor monitor(&std::cerr);
+
+  // leave GPU utilization out of the recording
+  monitor.config.utilization.store(false);
+
+  monitor.start();
+
+  // idle forever while the monitor samples; ctrl-c to exit
+  for (;;) {
+    std::this_thread::sleep_for(std::chrono::seconds(5));
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/include/perfect/gpu_monitor.hpp b/include/perfect/gpu_monitor.hpp
new file mode 100644
index 0000000..0857193
--- /dev/null
+++ b/include/perfect/gpu_monitor.hpp
@@ -0,0 +1,147 @@
+#pragma once
+
+#ifdef __NVCC__
+#ifndef PERFECT_HAS_CUDA
+#define PERFECT_HAS_CUDA
+#endif
+#endif
+
+#ifdef PERFECT_HAS_CUDA
+#include <nvml.h>
+#endif
+
+#include <atomic>
+#include <chrono>
+#include <string>
+#include <thread>
+
+#include <iostream>
+
+#include "perfect/init.hpp"
+
+namespace perfect {
+
+/*! Background sampler: writes one CSV row per GPU per sample to a stream, as
+ *  elapsed_ms,gpu,power,util_gpu,util_mem,temperature,pstate
+ *  ("x" = metric disabled, -1 = NVML query failed).
+ */
+class Monitor {
+public:
+  /*! Options shared with the worker; set samplePeriodMs before start(). */
+  struct Config {
+    std::atomic<bool> stop;
+    std::atomic<bool> pause;
+    double samplePeriodMs; //!< delay between samples, in milliseconds
+
+    std::atomic<bool> power;
+    std::atomic<bool> utilization;
+    std::atomic<bool> temperature;
+    std::atomic<bool> pstate;
+    std::ostream *stream_; //!< where rows are written; not owned
+
+    Config(std::ostream *stream) // initializers follow declaration order
+        : stop(true), pause(false), samplePeriodMs(100), power(true),
+          utilization(true), temperature(true), pstate(true), stream_(stream) {}
+  };
+
+  std::thread worker;
+  Config config;
+
+  Monitor(std::ostream *stream) : config(stream) {}
+  ~Monitor() { stop(); } // a joinable std::thread must not be destroyed
+
+  /*! Thread body: sample every GPU each period until cfg.stop is set. */
+  static void worker_func(const Config &cfg) {
+    using std::chrono::steady_clock; // monotonic, unlike system_clock
+    using millis = std::chrono::duration<double, std::milli>;
+    unsigned int deviceCount = 0;
+    if (nvmlDeviceGetCount(&deviceCount) != NVML_SUCCESS) {
+      deviceCount = 0; // sample nothing rather than trust an unset count
+    }
+    const steady_clock::time_point start = steady_clock::now();
+
+    while (!cfg.stop.load()) {
+      if (!cfg.pause.load()) {
+        const double elapsed = millis(steady_clock::now() - start).count();
+        for (unsigned int i = 0; i < deviceCount; ++i) {
+          // without a valid handle, every metric below is reported as -1
+          nvmlDevice_t device;
+          const bool ok =
+              nvmlDeviceGetHandleByIndex(i, &device) == NVML_SUCCESS;
+          std::ostream &out = *cfg.stream_;
+          out << elapsed << "," << i;
+          if (cfg.power.load()) {
+            unsigned int milliwatts;
+            if (ok && nvmlDeviceGetPowerUsage(device, &milliwatts) ==
+                          NVML_SUCCESS) {
+              out << "," << milliwatts;
+            } else {
+              out << "," << -1;
+            }
+          } else {
+            out << ",x";
+          }
+          if (cfg.utilization.load()) {
+            // period is between 1 second and 1/6 second depending on product
+            nvmlUtilization_t utilization;
+            if (ok && nvmlDeviceGetUtilizationRates(device, &utilization) ==
+                          NVML_SUCCESS) {
+              out << "," << utilization.gpu << "," << utilization.memory;
+            } else {
+              out << "," << -1 << "," << -1;
+            }
+          } else {
+            out << ",x,x";
+          }
+          if (cfg.temperature.load()) {
+            unsigned int temperature;
+            if (ok && nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU,
+                                               &temperature) == NVML_SUCCESS) {
+              out << "," << temperature;
+            } else {
+              out << "," << -1;
+            }
+          } else {
+            out << ",x";
+          }
+          if (cfg.pstate.load()) {
+            nvmlPstates_t pState;
+            if (ok && nvmlDeviceGetPerformanceState(device, &pState) ==
+                          NVML_SUCCESS) {
+              out << "," << pState;
+            } else {
+              out << "," << -1;
+            }
+          } else {
+            out << ",x";
+          }
+          out << "\n";
+        }
+      }
+      // honor the configured period instead of a fixed 100 ms
+      std::this_thread::sleep_for(millis(cfg.samplePeriodMs));
+    }
+  }
+
+  /*! Launch the sampling thread; does nothing if it is already running. */
+  void start() {
+    if (worker.joinable()) {
+      return; // assigning over a joinable std::thread calls std::terminate
+    }
+    config.stop = false;
+    worker = std::thread([this] { worker_func(config); });
+  }
+
+  /*! Ask the worker to exit and wait for it; safe if it was never started. */
+  void stop() {
+    config.stop = true;
+    if (worker.joinable()) {
+      worker.join();
+    }
+  }
+
+  void resume() { config.pause.store(false); } //!< continue sampling
+  void pause() { config.pause.store(true); }   //!< skip sampling until resume()
+};
+
+} // namespace perfect
\ No newline at end of file
diff --git a/include/perfect/init.hpp b/include/perfect/init.hpp
index 6a87dc5..1e5acc8 100644
--- a/include/perfect/init.hpp
+++ b/include/perfect/init.hpp
@@ -1,13 +1,17 @@
 #pragma once
 
 #ifdef __NVCC__
+#ifndef PERFECT_HAS_CUDA
 #define PERFECT_HAS_CUDA
 #endif
+#endif
 
 #ifdef PERFECT_HAS_CUDA
 #include <nvml.h>
 #endif
 
+#include "perfect/result.hpp"
+
 namespace perfect {
 
 /*! initialize the benchmark
diff --git a/include/perfect/result.hpp b/include/perfect/result.hpp
index 93ad46e..2c2036b 100644
--- a/include/perfect/result.hpp
+++ b/include/perfect/result.hpp
@@ -3,8 +3,10 @@
 #include <cassert>
 
 #ifdef __NVCC__
+#ifndef PERFECT_HAS_CUDA
 #define PERFECT_HAS_CUDA
 #endif
+#endif
 
 #ifdef PERFECT_HAS_CUDA
 #include <nvml.h>