From 855e9aed9cb1be6630dffc552f062fb6531d5a50 Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpearson@users.noreply.github.com>
Date: Mon, 8 May 2017 14:06:30 -0700
Subject: [PATCH] Updates from ShareLaTeX

---
 main.tex | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/main.tex b/main.tex
index 5473ea7..86d5f7d 100644
--- a/main.tex
+++ b/main.tex
@@ -121,8 +121,8 @@ A $16$-GPU MPI execution is not shown, as only one S822LC was available for eval
 \end{center}
   \caption{
   MLFMM execution times and speedup over single-threaded execution on Blue Waters XE and XK nodes (a) and S822LC (b).
-  Dark bars represents execution time (left axis).
-  Light bars show speedup normalized to the ``1T'' execution (right axis).
+  Light bars represent execution time (left axis).
+  Dark bars show speedup normalized to the ``1T'' execution (right axis).
   }
   \label{fig:mlfmm_performance}
 \end{figure}
@@ -133,13 +133,13 @@ When more threads than units are created, each unit is more fully-utilized than
 thread-to-unit conditions.
 
 In both systems, using a GPU for MLFMM provides substantial speedup (additional $3.1\times$ on XE/XK, $9.2\times$ on S822LC) over fully utilizing the CPUs.
-In current-generation GPUs like the P100 in S822LC, this speedup justifies the considerable time investmeed in a CUDA implementation.
+With current-generation GPUs such as the P100 in S822LC, this speedup justifies the considerable time invested in a CUDA implementation.
 Furthermore, nearly linear scaling when using multiple GPUs is also achieved thanks to overlapping all required MPI communication with GPU computation, for a total speedup of $794\times$ over ``1T'' when using $16$ GPUs on $16$ XK nodes, and $969\times$ when using $4$ GPUs on S822LC.
 This corresponds to a reduction in execution time from approximately $33$ seconds to $40$ milliseconds on XK nodes, and $28$ seconds to $29$ milliseconds on S822LC.
 
 Despite the 5-year gap between deployment of Blue Waters and S822LC, the baseline ``1T'' execution is only $1.2\times$ faster on S822LC than on an XE node.
 This reflects the current slow pace of single-threaded CPU performance improvement in the industry.
-The corresponding single-GPU speedup in S822LC over XK  $4.4\times$.
+The corresponding single-GPU speedup in S822LC over XK is $4.4\times$.
 On a per-node basis (``1 GPU'' in XK, ``4 GPU'' in S822LC), the speedup is $17.9\times$.
 
 \subsection{Computation Kernel Breakdown}
@@ -160,7 +160,7 @@ Fig.~\ref{fig:kernel_breakdown} shows the amount of  of MLFMM execution time spe
   \label{fig:kernel_breakdown}
 \end{figure}
 
-
+The \texttt{L2L} kernels exhibit the % TODO: unfinished sentence -- complete before submission
 
 %This document is a template for authors preparing papers for the
 %CEM'17 Computing and Electromagnetics Workshop in Barcelona, Spain.