diff --git a/cem17_template.tex b/cem17_template.tex
index 12e1b3d..a32d24c 100644
--- a/cem17_template.tex
+++ b/cem17_template.tex
@@ -47,7 +47,7 @@ Fig.~\ref{fig:app_breakdown} shows the amount of time the full inverse-solver ap
 ``BW (32T)'' corresponds to a 32-thread OpenMP parallel run on a single XE node, and S822LC corresponds to a 160-thread OpenMP parallel run on the S822LC node.
 Non-MLFMM operations are a minority of the time, and become an even smaller proportion of the time as the object reconstructions grow larger.
-\begin{figure}[h]
+\begin{figure}[ht]
 \begin{center}
 \begin{tabular}{c}
 \mbox{\psfig{figure=figures/cpu_matvec.pdf,width=8cm}}
@@ -56,6 +56,7 @@ Non-MLFMM operations are a minority of the time, and become an even smaller prop
 \caption{
 Amount of application time spent in MLFMM for two different execution environments.
 MLFMM is the dominant component even with CPU parallelization on a single node.
+ As object reconstructions grow larger or more challenging, MLFMM accounts for an even larger proportion of the application time.
 }
 \label{fig:app_breakdown}
 \end{figure}
@@ -103,27 +104,32 @@ The P100s are connected to the Power8 CPUs via $80$~GB/s NVLink connections.
 All evaluations are done on a problem with these parameters. \todo{get from mert}
-Fig.~\ref{fig:mlfmm_bw} shows the amount of of MLFMM execution time spent in computational kernels.
+Fig.~\ref{fig:mlfmm_bw} shows MLFMM performance scaling for various Blue Waters configurations.
-\begin{figure}[b]
+\begin{figure}[htbp]
 \begin{center}
 \begin{tabular}{c}
 \mbox{\psfig{figure=figures/mlfmm_bw.pdf,width=8cm}}
 \end{tabular}
 \end{center}
- \caption{BW.}
+ \caption{
+ MLFMM performance scaling for various Blue Waters (BW) configurations.
+ }
 \label{fig:mlfmm_bw}
 \end{figure}
-Fig.~\ref{fig:mlfmm_minsky} shows the amount of MLFMM execution time spent in computational kernels.
+Fig.~\ref{fig:mlfmm_minsky} shows MLFMM performance scaling for various S822LC configurations.
+
-\begin{figure}[b]
+\begin{figure}[htbp]
 \begin{center}
 \begin{tabular}{c}
 \mbox{\psfig{figure=figures/mlfmm_minsky.pdf,width=8cm}}
 \end{tabular}
 \end{center}
- \caption{S822LC.}
+ \caption{
+ MLFMM performance scaling for various S822LC configurations.
+ }
 \label{fig:mlfmm_minsky}
 \end{figure}
@@ -132,8 +138,12 @@ Fig.~\ref{fig:mlfmm_minsky} shows the amount of of MLFMM execution time spent in co
 \subsection{Computation Kernel Breakdown}
-Fig.~\ref{fig:kernel_breakdown} shows the amount of of MLFMM execution time spent in computational kernels.
+Fig.~\ref{fig:kernel_breakdown} shows the amount of MLFMM execution time spent in each computational kernel.
+\texttt{P2P} denotes the ``particle-to-particle'' or nearfield exchanges.
+\texttt{P2M} and \texttt{M2M} are the lowest-level and higher-level aggregations, respectively.
+\texttt{L2L} and \texttt{L2P} are the higher-level and lowest-level disaggregations, respectively.
+\texttt{M2L} denotes the multipole-to-local translations.
-\begin{figure}[b]
+\begin{figure}[htbp]
 \begin{center}
 \begin{tabular}{c}
 \mbox{\psfig{figure=figures/kernels.pdf,width=8cm}}
@@ -145,11 +155,11 @@ Fig.~\ref{fig:kernel_breakdown} shows the amount of of MLFMM execution time spe
-This document is a template for authors preparing papers for the
-CEM'17 Computing and Electromagnetics Workshop in Barcelona, Spain.
-The papers are required to use the IEEE style by following the
-instructions provided in this document. The language is English.
-The papers are expected to be two-pages long.
+%This document is a template for authors preparing papers for the
+%CEM'17 Computing and Electromagnetics Workshop in Barcelona, Spain.
+%The papers are required to use the IEEE style by following the
+%instructions provided in this document. The language is English.
+%The papers are expected to be two-pages long.
 \section{Text Format}