From f04708f3d486ff79fda54848def51f9f0af1231b Mon Sep 17 00:00:00 2001
From: Carl Pearson
Date: Tue, 9 May 2017 10:43:37 -0700
Subject: [PATCH] Updates from ShareLaTeX

---
 main.tex | 155 +++++++++++++++++++++++++++----------------------------
 1 file changed, 76 insertions(+), 79 deletions(-)

diff --git a/main.tex b/main.tex
index cb0b293..a7ba3a6 100644
--- a/main.tex
+++ b/main.tex
@@ -31,8 +31,8 @@ $~^{2}$Second Affiliation, City, Postal Code, Country\\

\begin{abstract}
The multilevel fast multipole method (MLFMM) is a key tool for efficiently solving large scattering problems governed by the Helmholtz equation.
-Highly inhomogeneous media prevents converting the problem into a surface-scattering problem via equivalence principle, and therefore we solve the corresponding volume integral equation.
-We evaluate an efficient implementation of MLFMM for such two-dimensional volumetric scattering problems on high-performance GPU-accelerated supercomputing nodes, where up to 969x speedup is achieved over single-thread CPU execution.
+Highly inhomogeneous media prevent converting the problem into a surface-scattering problem via the equivalence principle, and therefore the problem is solved using the corresponding volume integral equation.
+We evaluate an efficient implementation of MLFMM for such two-dimensional volumetric scattering problems on high-performance GPU-accelerated supercomputing nodes, where up to a $969\times$ speedup is achieved over single-thread CPU execution using four NVIDIA P100 GPUs.
\end{abstract}


@@ -48,35 +48,12 @@ During the MLFMM multiplications, data is transferred between GPUs through their
To hide this communication cost, MPI communication is overlapped with a long-running GPU kernel through a reordering of the MLFMM operations.
This strategy completely hides the communication cost and provides $96$\% MPI parallelization efficiency on up to 16 GPUs.

-\section{MLFMM Contribution to Application Time}
-\label{sec:application}
-
-Fig.~\ref{fig:app_breakdown} shows the amount of time the full inverse-solver application spends on MFLMM in two parallelized CPU executions.
-The MLFMM execution parameters are described in Section \ref{sec:results}.
-MLFMM is the dominant application component, responsible for 72\% of the execution time on a single XE node and 83\% of time on S822LC \textit{after} full CPU parallelization.
-This proportion grows arbitrarily close to $1.0$ as the scattering problems become larger or more challenging, justifying further targeted acceleration of MLFMM.
-
-\begin{figure}[ht]
-\begin{center}
-\begin{tabular}{c}
-\mbox{\psfig{figure=figures/cpu_matvec.pdf,width=8cm}}
-\end{tabular}
-\end{center}
- \caption{
- Amount of application time spent in MLFMM for two different execution environments.
- XE (32T) corresponds to a 32-thread OpenMP parallel run on a single XE node, and S822LC corresponds to a 160-thread OpenMP parallel run on the S822LC node.
- MLFMM is the dominant application component even with CPU parallelization.
- As object reconstructions grow larger or more challenging, MLFMM time further increases as a proportion of application time.
- }
- \label{fig:app_breakdown}
-\end{figure}

\section{MLFMM Performance Results}
\label{sec:results}

-As described in Section \ref{sec:application} and shown in Fig. \ref{fig:app_breakdown}, the MLFMM realization of matrix-vector multiplications forms the core computational kernel of the application, and its performance dominates that of the full inverse solver.
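The communication-hiding strategy described earlier in this patch follows the standard non-blocking MPI pattern: post the sends and receives, launch the GPU work that does not depend on remote data, and only consume the received data once the exchange has completed. The sketch below illustrates that general pattern only, under stated assumptions (CUDA-aware MPI so device buffers can be passed to MPI directly; the kernel names, buffer layout, and launch sizes are hypothetical); it is not the paper's implementation.

\begin{verbatim}
// Minimal sketch of hiding MPI transfers behind a long-running CUDA
// kernel. near_field_kernel, far_field_kernel, and the buffers are
// illustrative; CUDA-aware MPI is assumed for the device pointers.
#include <mpi.h>
#include <cuda_runtime.h>

__global__ void near_field_kernel(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= 2.0f;              // placeholder local work
}

__global__ void far_field_kernel(float *x, const float *recv, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] += recv[i];           // placeholder use of remote data
}

void overlapped_step(float *d_x, float *d_send, float *d_recv, int n,
                     int peer, cudaStream_t stream) {
  MPI_Request reqs[2];
  // 1. Start the non-blocking exchange with the neighboring rank.
  MPI_Irecv(d_recv, n, MPI_FLOAT, peer, 0, MPI_COMM_WORLD, &reqs[0]);
  MPI_Isend(d_send, n, MPI_FLOAT, peer, 0, MPI_COMM_WORLD, &reqs[1]);

  // 2. Launch the long-running kernel that needs no remote data;
  //    it executes while the messages are in flight.
  int threads = 256, blocks = (n + threads - 1) / threads;
  near_field_kernel<<<blocks, threads, 0, stream>>>(d_x, n);

  // 3. Only after the exchange completes, consume the received data.
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
  far_field_kernel<<<blocks, threads, 0, stream>>>(d_x, d_recv, n);
  cudaStreamSynchronize(stream);
}
\end{verbatim}

The essential property is that the kernel launched in step 2 is asynchronous with respect to the host, so message progress and GPU computation proceed concurrently; this mirrors, at a high level, the reordering of MLFMM operations described in the text.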
-This section presents an analysis of the performance of the MLFMM algorithm in three different environments.
+This section presents an analysis of the performance of the MLFMM algorithm on different computing systems.

\subsection{Evaluation Environments}

@@ -99,7 +76,7 @@ This section presents an analysis of the performance of the MLFMM algorithm in t
%\end{table}

The performance of MLFMM is evaluated on three different computing systems: Blue Waters XE nodes, Blue Waters XK nodes, and an IBM S822LC.
-The Blue Waters XE and XK nodes are two different kinds of computing nodes available on the Blue Waters supercomputer\cite{ncsa}.
+The Blue Waters XE and XK nodes are two different kinds of computing nodes available on the Blue Waters supercomputer~\cite{ncsa}.
Each Blue Waters node is a two-socket system: the XE node has two AMD Opteron 6276 CPUs, each with eight floating-point units, hardware support for 16 executing threads, and $32$~GB of RAM.
The XK node replaces one of these CPUs with an NVIDIA K20X GPU with the Kepler architecture and $6$~GB of RAM.
The K20X is connected to the Opteron 6276 with PCIe.
@@ -109,6 +86,29 @@ It has two IBM Power8 CPUs with ten floating-point units, support for 80 executi
In addition, each Minsky machine has four NVIDIA P100 Pascal-architecture GPUs with $16$~GB of RAM.
The P100s are connected to the Power8 CPUs via $80$~GB/s NVLink connections.

+\subsection{MLFMM Contribution to Application Time}
+
+The MLFMM realization of matrix-vector multiplications forms the core computational kernel of the application, and its performance dominates that of the full inverse solver.
+Fig.~\ref{fig:app_breakdown} shows the amount of time the full inverse-solver application spends on MLFMM in two parallelized CPU executions.
+MLFMM is responsible for 72\% of the execution time on a single XE node and 83\% of the time on S822LC \textit{after} full CPU parallelization.
+This proportion grows arbitrarily close to $1.0$ as the scattering problems become larger or more challenging, justifying further targeted acceleration of MLFMM.
+
+\begin{figure}[t]
+\begin{center}
+\begin{tabular}{c}
+\mbox{\psfig{figure=figures/cpu_matvec.pdf,width=8cm}}
+\end{tabular}
+\end{center}
+ \caption{
+ Amount of application time spent in MLFMM for a 32-thread CPU run on an XE node (left) and a 160-thread run on S822LC (right).
+ MLFMM is the dominant application component even with CPU parallelization.
+ As object reconstructions grow larger or more challenging, MLFMM time further increases as a proportion of application time.
+ }
+ \label{fig:app_breakdown}
+\end{figure}
+
+
+
\subsection{MLFMM Performance}

All evaluations are done on a problem with the following parameters. \todo{get from mert}

@@ -137,7 +137,7 @@ A $16$-GPU MPI execution is not shown, as only one S822LC was available for eval
\end{figure}

-Both XE and S822LC achieve more CPU speedup than they have floating-point units ($17\times$ at $32$ threads on $16$ units for XE, $26\times$ at $160$ threads on $20$ units for S822LC).
+Both XE and S822LC achieve more CPU speedup than they have floating-point units ($17\times$ with $32$ threads on $16$ units for XE, $26\times$ with $160$ threads on $20$ units for S822LC).
When more threads than units are created, each unit is more fully utilized than it would be under one-to-one thread-to-unit conditions.
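For reference, the thread counts quoted above ($32$ on XE, $160$ on S822LC) exceed the number of floating-point units because of hardware multithreading. A generic OpenMP row-parallel matrix-vector product of the kind such CPU baselines rely on is sketched below; the dense layout, complex single precision, and all names are our own illustration rather than the paper's code.

\begin{verbatim}
// Generic OpenMP row-parallel matrix-vector product, y = A * x, for a
// dense n-by-n block. Layout and types are illustrative only.
#include <cstddef>
#include <vector>
#include <complex>

using cplx = std::complex<float>;

void matvec(const std::vector<cplx> &A, const std::vector<cplx> &x,
            std::vector<cplx> &y, int n) {
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < n; ++i) {
    cplx acc(0.0f, 0.0f);
    for (int j = 0; j < n; ++j)
      acc += A[(std::size_t)i * n + j] * x[j];   // row i dot x
    y[i] = acc;
  }
}

// Requesting more threads than floating-point units (for example,
// OMP_NUM_THREADS=160 on the 20-FPU S822LC) lets hardware
// multithreading keep each unit busy while other threads stall on
// memory, which is why the measured speedup can exceed the unit count.
\end{verbatim}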
@@ -147,14 +147,12 @@ Furthermore, nearly linear scaling when using multiple GPUs is also achieved tha
This corresponds to a reduction in execution time from approximately $33$ seconds to $40$ milliseconds on XK nodes, and $28$ seconds to $29$ milliseconds on S822LC.
Despite the 5-year gap between the deployment of Blue Waters and S822LC, the baseline ``1T'' execution is only $1.2\times$ faster on S822LC than on an XE node.
-This reflects the current slow pace of single-threaded CPU performance improvement in the industry.
-The corresponding single-GPU speedup in S822LC over XK is $4.4\times$.
-On a per-node basis (``1 GPU'' in XK, ``4 GPU'' in S822LC), the speedup is $17.9\times$.
+This reflects the current slow pace of single-threaded CPU performance improvement.
+On the other hand, the P100 GPU in S822LC provides a $4.4\times$ speedup over the K20X in XK.
+On a per-node basis, the four GPUs in S822LC provide a $17.9\times$ speedup over the single GPU in XK.

\subsection{MPI Communication Overlap}

-\tikzstyle{int}=[draw, fill=blue!20, minimum size=2em]
-\tikzstyle{init} = [pin edge={to-,thin,black}]

\subsection{Computation Kernel Breakdown}

@@ -183,6 +181,50 @@ The average GPU kernel speedup on four GPU moving from XK to S822LC is $5.3\time
On both XK and S822LC, this kernel's performance is limited by the amount of CUDA shared memory it requires.
In S822LC, the newer Pascal GPU architecture provides $64$~KB of shared memory per streaming multiprocessor rather than the $48$~KB on XK, which allows more thread-blocks to run concurrently and provides the disproportionate speedup on that machine.
+
+% the following vfill coarsely balances the columns on the last page
+\vfill \pagebreak
+
+\section{Conclusions}
+This paper presents MLFMM performance results on three types of computer systems: Blue Waters XE and XK nodes, and an IBM S822LC.
+MLFMM is realized as matrix operations.
+Significant CPU speedup is achieved on both systems with OpenMP, and is further eclipsed by CUDA implementations that take advantage of well-understood matrix optimization techniques, reaching a $969\times$ speedup over single-threaded CPU execution on S822LC and bringing execution times from seconds to milliseconds even for large problems.
+On modern GPUs, this speedup justifies the significant investment in CUDA development.
+
+
+\section*{Acknowledgment}
+%Acknowledgments should be here.
+
+\bibliographystyle{IEEEtran}
+\begin{thebibliography}{99}
+\bibitem{ncsa}
+National Center for Supercomputing Applications,
+``System Summary,''
+[online]
+Available: https://bluewaters.ncsa.illinois.edu/hardware-summary.
+[Accessed: 8-May-2017].
+
+%\bibitem{journal} A.~Author, B.~Author, and C.~Author,
+%``Publication title,'' {\it Journal Title}, vol.~0, no.~0,
+%pp.~00--00, Month~Year.
+
+%\bibitem{book1} A.~Author, B.~Author, and C.~Author,
+%{\it Book Title}. Location: Publisher,~Year.
+
+%\bibitem{book2} A.~Author, B.~Author, and C.~Author,
+%``Chapter title,'' in {\it Book Title}, A.~Editor,~Ed. Location:
+%Publisher,~Year,~Chap.~0.
+
+%\bibitem{conf1} A.~Author, B.~Author, and C.~Author, ``Paper
+%title,'' in {\it Proc. Conference Title}, vol.~0, Year, pp.~0--0.
+
+%\bibitem{conf2} A.~Author, B.~Author, and C.~Author, ``Paper
+%title,'' {\it Conference Title}, Location, Country, Month~Year.
+
+\end{thebibliography}
+
+\end{document}
+
%This document is a template for authors preparing papers for the
%CEM'17 Computing and Electromagnetics Workshop in Barcelona, Spain.
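Returning to the kernel-breakdown discussion above: the effect of a kernel's per-block shared-memory footprint on how many blocks fit on one streaming multiprocessor can be checked directly with CUDA's occupancy API. The kernel and footprints below are a generic illustration, not the paper's MLFMM kernel; they merely expose the quantity that differs between the K20X ($48$~KB) and the P100 ($64$~KB of shared memory per SM).

\begin{verbatim}
// Query how a kernel's dynamic shared-memory footprint limits the
// number of resident thread-blocks per SM. The kernel is a stand-in.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void shmem_heavy_kernel(float *out) {
  extern __shared__ float tile[];      // dynamic shared memory
  tile[threadIdx.x] = (float)threadIdx.x;
  __syncthreads();
  out[blockIdx.x * blockDim.x + threadIdx.x] = tile[threadIdx.x];
}

int main() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  printf("shared memory per SM: %zu bytes\n",
         prop.sharedMemPerMultiprocessor);

  const int block = 256;
  const size_t footprints[] = {16 * 1024, 32 * 1024, 48 * 1024};
  for (size_t shmem : footprints) {
    int resident = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &resident, shmem_heavy_kernel, block, shmem);
    printf("%zu KB per block -> %d resident blocks per SM\n",
           shmem / 1024, resident);
  }
  return 0;
}
\end{verbatim}

With a $32$~KB footprint, for example, only one block fits in $48$~KB of shared memory while two fit in $64$~KB, which is consistent with the disproportionate kernel speedup reported on the P100.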
%The papers are required to use the IEEE style by following the
@@ -248,55 +290,10 @@ In S822LC, the newer Pascal GPU architecture provides $64$~KB of shared memory p
%\end{tabular}
%\end{table}
-
-
%\section{References}
%The heading of the references section is
%not be numbered and all reference items are in 8~pt font.
%References are required to be in IEEE style. Please refer to the
%examples for journals~\cite{journal}, for
%books~\cite{book1},~\cite{book2}, and for conference
%papers~\cite{conf1},~\cite{conf2}.
-
-% the following vfill coarsely balances the columns on the last page
-\vfill \pagebreak
-
-\section{Conclusions}
-This paper presents MLFMM performance results on three types of computer systems: Blue Waters XE and XK nodes, and an IBM S822LC.
-MLFMM is realized as matrix operations.
-Significant CPU speedup on both systems is achieved with OpenMP, and further eclipsed by CUDA implementations that take advantage of well-understood matrix optimization techniques, up to a speedup of $969\times$ over single-threaded CPU execution on S822LC, bringing execution times from seconds to milliseconds even for large problems.
-On modern GPUs, this speedup justifies the significant CUDA time investment.
-
-
-\section*{Acknowledgment}
-%Acknowledgments should be here.
-
-\bibliographystyle{IEEEtran}
-\begin{thebibliography}{99}
-\bibitem{ncsa}
-National Center for Supercomputing Applications,
-``System Summary,''
-[online]
-Available: https://bluewaters.ncsa.illinois.edu/hardware-summary.
-[Accessed: 8-May-2017].
-
-%\bibitem{journal} A.~Author, B.~Author, and C.~Author,
-%``Publication title,'' {\it Journal Title}, vol.~0, no.~0,
-%pp.~00--00, Month~Year.
-
-%\bibitem{book1} A.~Author, B.~Author, and C.~Author,
-%{\it Book Title}. Location: Publisher,~Year.
-
-%\bibitem{book2} A.~Author, B.~Author, and C.~Author,
-%``Chapter title,'' in {\it Book Title}, A.~Editor,~Ed. Location:
-%Publisher,~Year,~Chap.~0.
-
-%\bibitem{conf1} A.~Author, B.~Author, and C.~Author, ``Paper
-%title,'' in {\it Proc. Conference Title}, vol.~0, Year, pp.~0--0.
-
-%\bibitem{conf2} A.~Author, B.~Author, and C.~Author, ``Paper
-%title,'' {\it Conference Title}, Location, Country, Month~Year.
-
-\end{thebibliography}
-
-\end{document}
+%papers~\cite{conf1},~\cite{conf2}.
\ No newline at end of file