From 41e214500cbc54544abb0ef229909e4ea63eee05 Mon Sep 17 00:00:00 2001
From: Carl Pearson <pearson@illinois.edu>
Date: Thu, 18 May 2017 09:33:50 -0700
Subject: [PATCH] fix acks

---
 figures/cpu_matvec.pdf   | Bin 12050 -> 12050 bytes
 figures/kernels.pdf      | Bin 13466 -> 13466 bytes
 figures/mlfmm.pdf        | Bin 18027 -> 18027 bytes
 figures/mlfmm_bw.pdf     | Bin 16277 -> 16277 bytes
 figures/mlfmm_minsky.pdf | Bin 15684 -> 15684 bytes
 main.tex                 |   3 ++-
 6 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/figures/cpu_matvec.pdf b/figures/cpu_matvec.pdf
index 1d17335536a809793871be5ac699bcc7ad802085..6117160c05abcf1e365552f5b49cb46957464b3e 100644
GIT binary patch
delta 18
ZcmbOfHz{tz3N2PkV`Br$&1<z%SpY*%24esK

delta 18
ZcmbOfHz{tz3N2O(Qv(Bo&1<z%SpY)`23P<9

diff --git a/figures/kernels.pdf b/figures/kernels.pdf
index 394a59e7da69a1b033022153aab8094bdd1182ee..370aaf016eac992d5cd28eb6c90076019a6ab3e3 100644
GIT binary patch
delta 18
acmbQ0IV*F+7kySsV`Br$%|G=QvH$=}jt6G|

delta 18
acmbQ0IV*F+7kyR>Qv(Bo%|G=QvH$=}NC#X1

diff --git a/figures/mlfmm.pdf b/figures/mlfmm.pdf
index 2dc6d1ae4ec1c5e181c53a6aa721f2c87749c11d..c3b0f1d9b16ff0d02dde9eb4ff0a10d50b2c2656 100644
GIT binary patch
delta 20
bcmaFe!}z*~al=aoR!d`JLxaul9BNqsUj_&=

delta 20
bcmaFe!}z*~al=aoRtr-D1H;Yl9BNqsUiAnt

diff --git a/figures/mlfmm_bw.pdf b/figures/mlfmm_bw.pdf
index cee36aa9ffe8fa097fe4b728c7db24eda3cf0a41..5658d9aec82ef6dc6376e89cfb4c894ef036d748 100644
GIT binary patch
delta 18
ZcmbPQKec{?zZI*cv9Y1i<`64KRsceB1@Hg>

delta 18
ZcmbPQKec{?zZI*6seys<<`64KRscd_1?~U<

diff --git a/figures/mlfmm_minsky.pdf b/figures/mlfmm_minsky.pdf
index 31738afcb41359cb74b1f32e377b4af601226e83..fb5e01c614938ac90cd032c7272ed0b42271ed89 100644
GIT binary patch
delta 18
ZcmX?7b);&8l?AJ%v9Y1?W_t@(RscpD1=|1s

delta 18
ZcmX?7b);&8l?AJXseysXW_t@(Rsco{1=#=q

diff --git a/main.tex b/main.tex
index 465c47a..cd78471 100644
--- a/main.tex
+++ b/main.tex
@@ -153,6 +153,7 @@ This reflects the slow pace of single-threaded CPU performance improvement.
 On the other hand, the P100 GPU in S822LC provides 4.4x speedup over the K20x in XK. 
 On a per-node basis the four GPUs in S822LC provide 17.9 speedup over the single GPU in XK.
 
+The nearfield kernel consumes approximately 60\% of the MLFMM time.
 The average kernel-execution speedup moving from K20x to P100 is 5.3x, and the disaggregation kernel speedup is the largest, at 8x.
 On both K20x and P100, this kernel's performance is limited by the amount of CUDA shared memory it requires. 
 In S822LC, the newer Pascal GPU architecture provides 64 KB of shared memory per thread-block rather than the 48 KB on XK, which allows more thread-blocks to run concurrently and provide the disproportionate speedup on that machine.
@@ -166,7 +167,7 @@ This speedup justifies the significant CUDA time investment.
 
 
 \section*{Acknowledgments}
-This work was supported by the NVIDIA GPU Center of Excellence, the NCSA Petascale Improvement Discovery Program, and the IBM-Illinois Center for Cognitive Computing Systems Research (C3SR).
+This work was supported by the NVIDIA GPU Center of Excellence and the NCSA Petascale Application Improvement Discovery (PAID) program.
 
 \bibliographystyle{IEEEtran}
 \begin{thebibliography}{99}