diff --git a/src/core/device.cc b/src/core/device.cc index 63708fd..39e6c36 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1008,8 +1008,8 @@ acDeviceRunMPITest(void) acLoadConfig(AC_DEFAULT_CONFIG, &info); // Large mesh dim - const int nn = 128; - const int num_iters = 10; + const int nn = 512; + const int num_iters = 100; info.int_params[AC_nx] = info.int_params[AC_ny] = nn; info.int_params[AC_nz] = BENCH_STRONG_SCALING ? nn : nn * num_processes; info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx]; @@ -1064,6 +1064,14 @@ acDeviceRunMPITest(void) acDeviceCreate(pid % devices_per_node, submesh_info, &device); acDeviceLoadMesh(device, STREAM_DEFAULT, submesh); + // Enable peer access + MPI_Barrier(MPI_COMM_WORLD); + const int front = (device->id + 1) % devices_per_node; + const int back = (device->id + devices_per_node - 1) % devices_per_node; + cudaSetDevice(device->id); + WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); + WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); + // Verification start /////////////////////////////////////////////////////////////////////// #if BENCH_STRONG_SCALING { @@ -1094,6 +1102,13 @@ acDeviceRunMPITest(void) #endif // Verification end /////////////////////////////////////////////////////////////////////// + // Warmup + for (int i = 0; i < 10; ++i) + acDeviceIntegrateStepMPI(device, 0); + + acDeviceSynchronizeStream(device, STREAM_ALL); + MPI_Barrier(MPI_COMM_WORLD); + // Benchmark start /////////////////////////////////////////////////////////////////////// std::vector<double> results; results.reserve(num_iters); @@ -1114,7 +1129,7 @@ acDeviceRunMPITest(void) } const double ms_elapsed = timer_diff_nsec(total_time) / 1e6; - const double nth_percentile = 0.95; + const double nth_percentile = 0.90; std::sort(results.begin(), results.end(), [](const double& a, const double& b) { return a < b; }); diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc index f8dcdf2..976e745 100644 
--- a/src/standalone/benchmark.cc +++ b/src/standalone/benchmark.cc @@ -49,14 +49,15 @@ smaller_than(const double& a, const double& b) int run_benchmark(const char* config_path) { - const int nn = 256; + const int nn = 512; const int num_iters = 100; +#define BENCH_STRONG_SCALING (1) + const int num_processes = acGetNumDevicesPerNode(); AcMeshInfo mesh_info; load_config(config_path, &mesh_info); - mesh_info.int_params[AC_nx] = nn; - mesh_info.int_params[AC_ny] = mesh_info.int_params[AC_nx]; - mesh_info.int_params[AC_nz] = mesh_info.int_params[AC_nx]; + mesh_info.int_params[AC_nx] = mesh_info.int_params[AC_ny] = nn; + mesh_info.int_params[AC_nz] = BENCH_STRONG_SCALING ? nn : nn * num_processes; update_config(&mesh_info); AcMesh* mesh = acmesh_create(mesh_info); @@ -74,6 +75,7 @@ run_benchmark(const char* config_path) } acSynchronize(); const AcReal dt = FLT_EPSILON; + printf("Using dt = %g\n", dt); Timer total_time; timer_reset(&total_time); @@ -89,13 +91,25 @@ run_benchmark(const char* config_path) } acSynchronize(); const double ms_elapsed = timer_diff_nsec(total_time) / 1e6; - const double nth_percentile = 0.95; + const double nth_percentile = 0.90; std::sort(results.begin(), results.end(), smaller_than); printf("vertices: %d^3, iterations: %d\n", nn, num_iters); printf("Total time: %f ms\n", ms_elapsed); - printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), - results[int(nth_percentile * num_iters)]); + printf("Time per step: %f ms\n", ms_elapsed / num_iters); + + const size_t nth_index = int(nth_percentile * num_iters); + printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), results[nth_index]); + + // Write out + char buf[256]; + sprintf(buf, "nprocs_%d_result_%s.bench", num_processes, + BENCH_STRONG_SCALING ? 
"strong" : "weak"); + FILE* fp = fopen(buf, "w"); + ERRCHK_ALWAYS(fp); + fprintf(fp, "num_processes, percentile (%dth)\n", int(100 * nth_percentile)); + fprintf(fp, "%d, %g\n", num_processes, results[nth_index]); + fclose(fp); acQuit(); acmesh_destroy(mesh);