Compare commits

...

10 Commits

Author SHA1 Message Date
Carl Pearson
41e214500c fix acks 2017-05-18 09:33:50 -07:00
Carl Pearson
8d32532f8c Merge branch 'master' of github.com:cwpearson/cem17 2017-05-18 08:50:34 -07:00
Carl Pearson
64d9383365 add submission target to makefile 2017-05-18 08:50:29 -07:00
Carl Pearson
85a6e5a1ff Updates from ShareLaTeX 2017-05-18 08:42:54 -07:00
Carl Pearson
0cba4b131e Merge sharelatex-2017-05-18-1536 into master 2017-05-18 08:36:35 -07:00
Carl Pearson
68a9ced550 Updates from ShareLaTeX 2017-05-18 08:36:34 -07:00
Carl Pearson
691ce0e527 figures 2017-05-18 08:35:15 -07:00
Carl Pearson
91069b54fa final revisions 2017-05-18 08:33:56 -07:00
Carl Pearson
f0ac93a60a parallelize figure generation 2017-05-16 11:47:15 -07:00
Carl Pearson
43630acf70 Updates from ShareLaTeX 2017-05-15 07:51:32 -07:00
12 changed files with 197 additions and 164 deletions

View File

@@ -4,31 +4,37 @@ PROJ = main
TEXFILE = main.tex TEXFILE = main.tex
PYTHON = python PYTHON = python
all: pdf all: pdf submission
# pdf: $(TEXFILE) # pdf: $(TEXFILE)
# $(TEX) -shell-escape $< # $(TEX) -shell-escape $<
# $(BIBTEX) $(PROJ) # $(BIBTEX) $(PROJ)
# $(TEX) -shell-escape $< # $(TEX) -shell-escape $<
pdf: $(TEXFILE)
figures/%.pdf: figures/%.py
$(PYTHON) $<
pdf: $(TEXFILE) figures/kernels.pdf figures/mlfmm.pdf figures/cpu_matvec.pdf
$(PYTHON) figures/plots.py $(PYTHON) figures/plots.py
$(TEX) -shell-escape $< $(TEX) -shell-escape $<
$(TEX) -shell-escape $< $(TEX) -shell-escape $<
watch: submission: pdf
$(info ************ WATCHING FOR CHANGES ************) zip -r cem17.zip main.pdf figures/kernels.pdf figures/mlfmm.pdf figures/cpu_matvec.pdf
watchman watch $(shell pwd)
watchman -- trigger $(shell pwd) pyfiles 'figures/*.py' -- ls -l
unwatch:
$(info ************ CANCELLING WATCH ************)
watchman watch-del "$(shell pwd)"
clean: clean:
rm -f \ rm -f \
cem17.zip \
$(TEXFILE:.tex=.pdf) \ $(TEXFILE:.tex=.pdf) \
$(TEXFILE:.tex=.aux) \ $(TEXFILE:.tex=.aux) \
$(TEXFILE:.tex=.log) \ $(TEXFILE:.tex=.log) \
$(TEXFILE:.tex=.toc) \ $(TEXFILE:.tex=.toc) \
figures/kernels.pdf \
figures/kernels.png \
figures/mlfmm.pdf \
figures/mlfmm.png \
figures/cpu_matvec.pdf \
figures/cpu_matvec.png \
texput.log \ texput.log \
main.pyg main.pyg
rm -rf \ rm -rf \

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 49 KiB

26
figures/cpu_matvec.py Normal file
View File

@@ -0,0 +1,26 @@
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
# Figure: share of application time spent in MLFMM for two CPU-only runs,
# drawn as a stacked bar (MLFMM at the bottom, everything else on top).
sns.set(style="white", context="talk")
plt.rcdefaults()

DPI = 300
BBOX_INCHES = 'tight'
path = 'figures/cpu_matvec'

fig, ax = plt.subplots(figsize=(6, 3))

systems = ('XE (32T)', "S822LC (160T)")
mlfmm = (8.65e1, 4.84e1)
total = (1.2e2, 5.77e1)
x_pos = np.arange(len(systems))

# Whatever is not MLFMM time is stacked on top of the MLFMM bar.
non_mlfmm = [whole - part for whole, part in zip(total, mlfmm)]

ax.bar(x_pos, mlfmm, color='0.4', label='MLFMM')
ax.bar(x_pos, non_mlfmm, color='0.8', bottom=mlfmm, label='Non-MLFMM')

ax.set_xticks(x_pos)
ax.set_xticklabels(systems)
ax.set_ylabel("Execution Time (s)")
ax.legend(*ax.get_legend_handles_labels())

sns.despine(trim=True)

# Vector output first, then the raster copy at DPI dots per inch.
plt.savefig(path + '.pdf', bbox_inches=BBOX_INCHES)
plt.savefig(path + '.png', dpi=DPI, bbox_inches=BBOX_INCHES)

Binary file not shown.

49
figures/kernels.py Normal file
View File

@@ -0,0 +1,49 @@
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
# Figure: per-kernel share of MLFMM time on each system, drawn as a stacked
# bar that is normalised so every column sums to one.
sns.set(style="white", context="talk")
plt.rcdefaults()

DPI = 300
BBOX_INCHES = 'tight'
path = 'figures/kernels'

fig, ax = plt.subplots()

systems = ['32T\n(1 XE)', "160T\n(1 S822LC)", "4 GPU\n(4 XK)" ,"4 GPU\n(1 S822LC)"]
x_pos = np.arange(len(systems))

# (legend label, grey level, one timing per entry of `systems`), listed from
# the top of the stack down to the bottom.
# NOTE(review): the unit of these timings is not stated here; only their
# ratios are plotted.
kernels = [
    ('P2M', '0.75', (127.10, 72.10749, 7.73, 1.604)),
    ('M2M', '0.50', (156.2506, 102.61091, 9.613814, 1.746476)),
    ('M2L', '0.40', (189.615, 82.67791, 18.177774, 2.671025)),
    ('L2L', '0.25', (91.5957, 101.56461, 20.215436, 2.611185)),
    ('L2P', '0.15', (196.2115, 68.38529, 6.994, 1.395)),
    ('P2P', '0', (1117.368, 590.4818, 90.619, 18.265)),
]

timings = [entry[2] for entry in kernels]
total = [sum(column) for column in zip(*timings)]
ratios = [[value / whole for value, whole in zip(row, total)] for row in timings]

# Every kernel sits on top of the summed ratios of the kernels listed after
# it; the last one (P2P) starts from the axis and is drawn without `bottom`.
for idx, (label, shade, _) in enumerate(kernels):
    below = ratios[idx + 1:]
    if below:
        base = [sum(column) for column in zip(*below)]
        ax.bar(x_pos, ratios[idx], color=shade, label=label, bottom=base)
    else:
        ax.bar(x_pos, ratios[idx], color=shade, label=label)

ax.set_xticks(x_pos)
ax.set_xticklabels(systems)
ax.set_ylabel("MLFMM Kernel Breakdown")
ax.legend(*ax.get_legend_handles_labels())

sns.despine(trim=True)

# Re-create the legend with a frame and make that frame slightly translucent.
legend = plt.legend(frameon = 1)
legend.get_frame().set_alpha(0.9)

plt.savefig(path + '.pdf', bbox_inches=BBOX_INCHES)
plt.savefig(path + '.png', dpi=DPI, bbox_inches=BBOX_INCHES)

Binary file not shown.

71
figures/mlfmm.py Normal file
View File

@@ -0,0 +1,71 @@
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
# Select seaborn's "white" style with the "talk" context.
sns.set(style="white", context="talk")
# NOTE(review): rcdefaults() runs immediately after sns.set(), which presumably
# resets the rc parameters seaborn just changed — confirm this is intended.
plt.rcdefaults()
# Resolution (dots per inch) passed to savefig for the PNG copy of the figure.
DPI=300
# Passed to savefig as bbox_inches for both the PDF and the PNG output.
BBOX_INCHES='tight'
def autolabel(ax, rect):
    """
    Annotate one bar with its height, rounded to the nearest integer.

    The text is centred horizontally on the bar and anchored by its bottom
    edge at 1.05 times the bar height, on the axes given by ``ax``.
    """
    height = rect.get_height()
    centre_x = rect.get_x() + rect.get_width()/2.
    label = '{:d}'.format(int(round(height)))
    ax.text(centre_x, 1.05*height, label, ha='center', va='bottom')
# Figure: two stacked panels, each pairing a wall-time bar (left axis) with a
# speedup bar (right axis) for every system configuration.
path = "figures/mlfmm"
width = 0.33
WALLTIME = "0.8"
SPEEDUP = "0.5"


def _draw_panel(ax, title, systems, mlfmm, num):
    """
    Draw one panel of paired bars on ``ax``.

    For every entry of ``systems`` a bar of ``mlfmm[i] / num[i]`` is drawn on
    ``ax`` and a bar of ``mlfmm[0] / mlfmm[i]`` on a twin y-axis, both on
    logarithmic scales. Every bar is labelled with its height except the
    first speedup bar, which is ``mlfmm[0] / mlfmm[0]``.
    NOTE(review): ``num`` is presumably the number of MLFMM invocations in
    each run — confirm against the measurement logs.
    """
    ax.set_title(title)
    x_pos = np.arange(len(systems))

    per_call = [elapsed / count for elapsed, count in zip(mlfmm, num)]
    rects = ax.bar([p - width/2 for p in x_pos], per_call, color=WALLTIME, log=True, width=width)
    for rect in rects:
        autolabel(ax, rect)
    ax.set_ylim(1, 2e5)

    ax2 = ax.twinx()
    speedup = [mlfmm[0] / elapsed for elapsed in mlfmm]
    rects = ax2.bar([p + width/2 for p in x_pos], speedup, color=SPEEDUP, log=True, width=width)
    for rect in list(rects)[1:]:
        autolabel(ax2, rect)
    ax2.set_ylim(1, 5e3)

    ax.set_xticks(x_pos)
    ax.set_xticklabels(systems)


fig, axes = plt.subplots(2, 1)

_draw_panel(
    axes[0],
    "(a)",
    ('1T\n(1 XE)', "32T\n(1 XE)", "1 GPU\n(1 XK)" ,"4 GPU\n(4 XK)", "16 GPU\n(16 XK)"),
    (1.50e6, 8.64e4, 2.783779e4, 7.01e3, 1.89e3),
    (45, 45, 45, 45, 47),
)
_draw_panel(
    axes[1],
    "(b)",
    ('1T\n(1 S822LC)', "160T\n(1 S822LC)", "1 GPU\n(1 S822LC)" ,"4 GPU\n(1 S822LC)"),
    [1.25e6, 4.84e4, 5.22e3, 1.29e3],
    (44, 44, 44, 44),
)

# Shared axis captions, placed at the left and right edges of the figure.
fig.text(-0.01, 0.5, "Per-MLFMM Execution Time (ms)", va='center', rotation='vertical')
fig.text(0.99, 0.5, "Speedup over Sequential", va='center', rotation='vertical')
fig.tight_layout()

plt.savefig(path + '.pdf', bbox_inches=BBOX_INCHES)
plt.savefig(path + '.png', dpi=DPI, bbox_inches=BBOX_INCHES)

Binary file not shown.

Binary file not shown.

View File

@@ -16,67 +16,6 @@ def autolabel(ax, rect):
'%d' % int(round(height)), '%d' % int(round(height)),
ha='center', va='bottom') ha='center', va='bottom')
path = 'figures/cpu_matvec'
fig, ax = plt.subplots(figsize=(6, 3))
systems = ('XE (32T)', "S822LC (160T)")
mlfmm = (8.65e4, 4.84e4)
total = (1.2e5, 5.77e4)
x_pos = np.arange(len(systems))
ax.bar(x_pos, mlfmm, color='0.4', label='MLFMM')
ax.bar(x_pos, [i-j for i,j in zip(total, mlfmm)], color='0.8', bottom=mlfmm, label='Non-MLFMM')
ax.set_xticks(x_pos)
ax.set_xticklabels(systems)
ax.set_ylabel("Execution Time (ms)")
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
sns.despine(trim=True)
plt.savefig(path+'.pdf', bbox_inches=BBOX_INCHES)
plt.savefig(path+'.png', dpi=DPI, bbox_inches=BBOX_INCHES)
print path
path = 'figures/kernels'
fig, ax = plt.subplots()
systems = ['32T\n(1 XE)', "160T\n(1 S822LC)", "4 GPU\n(4 XK)" ,"4 GPU\n(1 S822LC)"]
x_pos = np.arange(len(systems))
p2m = (127.10, 72.10749, 7.73, 1.604)
m2m = (156.2506, 102.61091, 9.613814, 1.746476)
m2l = (189.615, 82.67791, 18.177774, 2.671025)
l2l = (91.5957, 101.56461, 20.215436, 2.611185)
l2p = (196.2115, 68.38529, 6.994, 1.395)
p2p = (1117.368, 590.4818, 90.619, 18.265)
total = [sum(i) for i in zip(p2m,m2m,m2l,l2l,l2p,p2p)]
p2m_ratio = [i/j for i,j in zip(p2m, total)]
m2m_ratio = [i/j for i,j in zip(m2m, total)]
m2l_ratio = [i/j for i,j in zip(m2l, total)]
l2l_ratio = [i/j for i,j in zip(l2l, total)]
l2p_ratio = [i/j for i,j in zip(l2p, total)]
p2p_ratio = [i/j for i,j in zip(p2p, total)]
ax.bar(x_pos, p2m_ratio, color='0.75', label='P2M', bottom=[sum(i) for i in zip(m2m_ratio, m2l_ratio, l2l_ratio, l2p_ratio,p2p_ratio)])
ax.bar(x_pos, m2m_ratio, color='0.50', label='M2M', bottom=[sum(i) for i in zip(m2l_ratio, l2l_ratio, l2p_ratio,p2p_ratio)])
ax.bar(x_pos, m2l_ratio, color='0.40', label='M2L', bottom=[sum(i) for i in zip(l2l_ratio, l2p_ratio,p2p_ratio)])
ax.bar(x_pos, l2l_ratio, color='0.25', label='L2L', bottom=[sum(i) for i in zip(l2p_ratio,p2p_ratio)])
ax.bar(x_pos, l2p_ratio, color='0.15', label='L2P', bottom=p2p_ratio)
ax.bar(x_pos, p2p_ratio, color='0', label='P2P')
ax.set_xticks(x_pos)
ax.set_xticklabels(systems)
ax.set_ylabel("MLFMM Kernel Breakdown")
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
# plt.ylim([1, 1e4])
# ax.set_title('How fast do you want to go today?')
sns.despine(trim=True)
legend = plt.legend(frameon = 1)
frame = legend.get_frame()
# frame.set_facecolor('white')
# frame.set_edgecolor('white')
frame.set_alpha(0.9)
plt.savefig(path+'.pdf', bbox_inches=BBOX_INCHES)
plt.savefig(path+'.png', dpi=DPI, bbox_inches=BBOX_INCHES)
print path
path='figures/mlfmm_bw' path='figures/mlfmm_bw'
width=0.33 width=0.33
@@ -138,58 +77,3 @@ print path
path="figures/mlfmm"
width=0.33
WALLTIME="0.8"
SPEEDUP="0.5"
fig, axes = plt.subplots(2,1)
ax=axes[0]
ax.set_title("(a)")
systems = ('1T\n(1 XE)', "32T\n(1 XE)", "1 GPU\n(1 XK)" ,"4 GPU\n(4 XK)", "16 GPU\n(16 XK)")
mlfmm = (1.50e6, 8.64e4, 2.783779e4, 7.01e3, 1.89e3)
num = (45,45,45,45,47)
x_pos = np.arange(len(systems))
rects = ax.bar([p-width/2 for p in x_pos], [i/j for i,j in zip(mlfmm,num)], color=WALLTIME, log=True, width=width)
autolabel(ax, rects[0])
autolabel(ax, rects[1])
autolabel(ax, rects[2])
autolabel(ax, rects[3])
autolabel(ax, rects[4])
ax.set_ylim(1,2e5)
ax2=ax.twinx()
rects = ax2.bar([p+width/2 for p in x_pos], [mlfmm[0] / i for i in mlfmm], color=SPEEDUP, log=True, width=width)
autolabel(ax2, rects[1])
autolabel(ax2, rects[2])
autolabel(ax2, rects[3])
autolabel(ax2, rects[4])
ax2.set_ylim(1,5e3)
ax.set_xticks(x_pos)
ax.set_xticklabels(systems)
ax=axes[1]
ax.set_title("(b)")
systems = ('1T\n(1 S822LC)', "160T\n(1 S822LC)", "1 GPU\n(1 S822LC)" ,"4 GPU\n(1 S822LC)")
mlfmm = [1.25e6, 4.84e4, 5.22e3, 1.29e3]
num = (44,44,44,44)
x_pos = np.arange(len(systems))
rects = ax.bar([p-width/2 for p in x_pos], [i/j for i,j in zip(mlfmm,num)], color=WALLTIME, log=True, width=width)
autolabel(ax, rects[0])
autolabel(ax, rects[1])
autolabel(ax, rects[2])
autolabel(ax, rects[3])
ax.set_ylim(1,2e5)
ax2=ax.twinx()
rects = ax2.bar([p+width/2 for p in x_pos], [mlfmm[0] / i for i in mlfmm], color=SPEEDUP, log=True, width=width)
autolabel(ax2, rects[1])
autolabel(ax2, rects[2])
autolabel(ax2, rects[3])
ax2.set_ylim(1,5e3)
ax.set_xticks(x_pos)
ax.set_xticklabels(systems)
fig.text(-0.01, 0.5, "Per-MLFMM Execution Time (ms)", va='center', rotation='vertical')
fig.text(0.99, 0.5, "Speedup over Sequential", va='center', rotation='vertical')
fig.tight_layout()
plt.savefig(path+'.pdf', bbox_inches=BBOX_INCHES)
plt.savefig(path+'.png', dpi=DPI, bbox_inches=BBOX_INCHES)
print path

View File

@@ -16,15 +16,14 @@
\usepackage{verbatim} \usepackage{verbatim}
\title{Evaluating MLFMM Performance for 2-D VIE Problems on Multiple Architectures} \title{Evaluating MLFMM Performance for 2-D VIE Problems on Multiple-GPU Architectures}
\author{ \author{
{Carl Pearson{\small $^{1}$}, Mert Hidayetoglu{\small $^{1}$}, Wei Ren{\small $^{2}$}, Levent Gurel{\small $^{1}$}, and Wen-Mei Hwu{\small $^{1}$} } {Carl Pearson{\small $^{1}$}, Mert Hidayeto\u{g}lu{\small $^{1}$}, Wei Ren{\small $^{2}$}, Levent G\"{u}rel{\small $^{1}$}, and Wen-Mei Hwu{\small $^{1}$} }
\vspace{1.6mm}\\ \vspace{1.6mm}\\
\fontsize{10}{10}\selectfont\itshape \fontsize{10}{10}\selectfont\itshape
$~^{1}$Department of Electrical and Computer Engineering, University of Illinois Urbana-Champaign, Urbana, IL 61801, USA\\ $~^{1}$Department of Electrical and Computer Engineering, University of Illinois Urbana-Champaign, Urbana, IL 61801, USA\\
$~^{2}$Department of Physics, University of Illinois Urbana-Champaign, Urbana, IL 61801, USA\\ $~^{2}$Department of Physics, University of Illinois Urbana-Champaign, Urbana, IL 61801, USA\\
%\fontsize{9}{9}\upshape \texttt{\{pearson, hidayet2, weiren2, lgurel, w-hwu\}}@illinois.edu} \fontsize{9}{9}\upshape \{pearson, hidayet2, weiren2, lgurel, w-hwu\}@illinois.edu}
\fontsize{9}{9}\upshape pearson@illinois.edu}
\begin{document} \begin{document}
\maketitle \maketitle
@@ -33,7 +32,7 @@ The multilevel fast multiple method (MLFMM) is a key tool for efficiently solvin
The problems are solved using volume integral equations instead of conversion into a corresponding surface-scattering problem through the equivalence principle to support highly inhomogeneous media. The problems are solved using volume integral equations instead of conversion into a corresponding surface-scattering problem through the equivalence principle to support highly inhomogeneous media.
The MLFMM implementation for two-dimensional volumetric scattering problems is realized through matrix operations optimized with shared memory tiling, register tiling, and thread coarsening. The MLFMM implementation for two-dimensional volumetric scattering problems is realized through matrix operations optimized with shared memory tiling, register tiling, and thread coarsening.
MPI communications are overlapped with GPU kernels to achieve high multi-node parallel efficiency. MPI communications are overlapped with GPU kernels to achieve high multi-node parallel efficiency.
The MLFMM is evaluated on current- and next-generation GPU-accelerated supercomputing nodes, where up to 969x speedup is achieved over single-thread CPU execution using 4 NVIDIA P100 graphics processing units. The MLFMM is evaluated on current- and next-generation GPU-accelerated supercomputing nodes, where up to 969x speedup is achieved over sequential CPU execution using 4 NVIDIA P100 graphics processing units.
\end{abstract} \end{abstract}
@@ -42,13 +41,13 @@ The MLFMM is evaluated on current- and next-generation GPU-accelerated supercomp
MLFMM computes pairwise interactions between pixels in the scattering problem by hierarchically clustering pixels into a spatial quad-tree. In the nearfield phase, nearby pixel interactions are computed within the lowest level of the MLFMM tree. The aggregation and disaggregation phases propagate interactions up and down the tree, and the translation phase propagates long-range interactions within a level. In this way, $\mathcal{O}(N)$ work for $N^2$ interactions is achieved for $N$ pixels~\cite{chew01}. The multilevel fast multipole method (MLFMM) computes pairwise interactions between pixels in the scattering problem by hierarchically clustering pixels into a spatial quad-tree. In the nearfield phase, nearby pixel interactions are computed within the lowest level of the MLFMM tree. The aggregation and disaggregation phases propagate interactions up and down the tree, and the translation phase propagates long-range interactions within each level. In this way, $\mathcal{O}(N)$ work for $N^2$ interactions is achieved for $N$ pixels~\cite{chew01}.
Even with algorithmic speedup, high-performance parallel MLFMM is needed to take advantage of high-performance computing resources. Even with algorithmic speedup, high-performance parallel MLFMM is needed to take advantage of high-performance computing resources.
This work presents how a GPU-accelerated MLFMM effectively scales from current to next-generation computers. This work presents how a GPU-accelerated MLFMM effectively scales from current to next-generation computers.
In order to achieve an efficient implementation on graphics processing units (GPUs), these four MLFMM phases are formulated as matrix multiplications. In order to achieve an efficient implementation on graphics processing units (GPUs), these four MLFMM phases are formulated as matrix multiplications.
Common operators are pre-computed, moved to the GPU, and reused as needed to avoid host-device data transfer. Common operators are pre-computed, moved to the GPU, and reused as needed to avoid host-device data transfer.
The MLFMM tree structure is partitioned among message passing interface (MPI) processes where each process employs a single GPU for performing partial multiplications. The MLFMM tree structure is partitioned among message passing interface (MPI) processes, where each process employs a single GPU for performing partial multiplications.
During the MLFMM multiplications, data is transferred between GPUs through their owning MPI processes by moving the data from GPUs to central processing units (CPUs), CPUs to CPUs through MPI, and then from CPUs to GPUs. During the MLFMM multiplications, data is transferred between GPUs through their owning MPI processes by moving the data from GPUs to central processing units (CPUs), CPUs to CPUs through MPI, and then from CPUs to GPUs.
To hide this communication cost, MPI communication is overlapped with GPU kernels. To hide this communication cost, MPI communication is overlapped with GPU kernels.
This strategy completely hides the communication cost and provides 96\% MPI parallelization efficiency on up to 16 GPUs. This strategy completely hides the communication cost and provides 96\% MPI parallelization efficiency on up to 16 GPUs.
@@ -61,23 +60,7 @@ This section presents an analysis of the performance of the MLFMM algorithm on d
\subsection{Evaluation Environments} \subsection{Evaluation Environments}
%\begin{table}{}
%\centering \caption{Evaluation Systems} \label{tab:systems}
%\begin{tabular}{|c|c|c|c|}
%\hline & \textbf{XK Node} & \textbf{XE Node} & \textbf{S822LC} \\
%\hline
%\hline \textbf{CPU 1} & AMD Opteron 6276 & AMD Opteron 6276 & IBM Power8 \\
%\hline \textbf{CPU 2} & -- & AMD Opteron 6276 & IBM Power8 \\
%\hline
%\hline \textbf{GPU 1} & \makecell{K20X \\ (6 GB RAM) } & -- & P100 (16GB RAM) \\
%\hline \textbf{GPU 2} & -- & -- & P100 (16GB RAM) \\
%\hline \textbf{GPU 3} & -- & -- & P100 (16GB RAM) \\
%\hline \textbf{GPU 4} & -- & -- & P100 (16GB RAM) \\
%\hline \textbf{RAM} & 32GB & 64 GB & 512 GB \\
%\hline \makecell{\textbf{CPU-GPU} \\ \textbf{Bus}} & PCIe & -- & NVLink \\
%\hline
%\end{tabular}
%\end{table}
The performance of MLFMM is evaluated on three systems: XE and XK nodes from the Blue Waters supercomputer~\cite{ncsa}, and an IBM S822LC. The performance of MLFMM is evaluated on three systems: XE and XK nodes from the Blue Waters supercomputer~\cite{ncsa}, and an IBM S822LC.
Each Blue Waters node is a two-socket system: the XE node has two AMD Opteron 6276 CPUs, each with eight floating-point units, hardware support for 16 executing threads, and 32 GB of RAM. Each Blue Waters node is a two-socket system: the XE node has two AMD Opteron 6276 CPUs, each with eight floating-point units, hardware support for 16 executing threads, and 32 GB of RAM.
@@ -100,7 +83,7 @@ It also has four NVIDIA P100 GPUs with 16 GB of RAM each.
\caption{ \caption{
Amount of application time spent in MLFMM for a 32-thread CPU run on an XE node (left) and a 160-thread run on S822LC (right). Amount of application time spent in MLFMM for a 32-thread CPU run on an XE node (left) and a 160-thread run on S822LC (right).
MLFMM is the dominant application component even with CPU parallelization. MLFMM is the dominant application component even with CPU parallelization.
As the number of pixels grow larger, MLFMM time further increases as a proportion of application time. As the number of pixels grows larger, MLFMM time further increases as a proportion of application time.
} }
\label{fig:app_breakdown} \label{fig:app_breakdown}
\end{figure} \end{figure}
@@ -160,9 +143,8 @@ A 16-GPU MPI execution is not shown, as only one S822LC was available for evalua
Both XE and S822LC achieve more CPU speedup than they have floating-point units (17x with 32 threads on 16 units for XE, 26x with 160 threads on 20 units for S822LC). Both XE and S822LC achieve more CPU speedup than they have floating-point units (17x with 32 threads on 16 units for XE, 26x with 160 threads on 20 units for S822LC).
When floating-point units are oversubscribed, they are more fully utilized. When floating-point units are oversubscribed, they are more fully utilized.
The CUDA implementations leverage well-understood techniques for optimizing matrix operations, including hybrid shared-memory and register tiling, and thread coarsening\cite{hwu11} The CUDA implementations leverage hybrid shared-memory and register tiling, and thread coarsening~\cite{hwu11}.
In both systems, using a GPU for MLFMM provides substantial speedup (additional 3.1x on XE/XK, 9.2x on S822LC) over fully utilizing the CPUs. In both systems, using a GPU for MLFMM provides substantial speedup (additional 3.1x on XE/XK, 9.2x on S822LC) over fully utilizing the CPUs.
This speedup justifies the considerable time invested in a CUDA implementation.
Furthermore, nearly linear scaling when using multiple GPUs is also achieved thanks to overlapping all required MPI communication with GPU computation. Furthermore, nearly linear scaling when using multiple GPUs is also achieved thanks to overlapping all required MPI communication with GPU computation.
This corresponds to a reduction in execution time from approximately 33 seconds to 40 milliseconds on XK nodes, and 28 seconds to 29 milliseconds on S822LC. This corresponds to a reduction in execution time from approximately 33 seconds to 40 milliseconds on XK nodes, and 28 seconds to 29 milliseconds on S822LC.
@@ -171,35 +153,34 @@ This reflects the slow pace of single-threaded CPU performance improvement.
On the other hand, the P100 GPU in S822LC provides 4.4x speedup over the K20x in XK. On the other hand, the P100 GPU in S822LC provides 4.4x speedup over the K20x in XK.
On a per-node basis, the four GPUs in S822LC provide 17.9x speedup over the single GPU in XK. On a per-node basis, the four GPUs in S822LC provide 17.9x speedup over the single GPU in XK.
The nearfield kernel consumes approximately 60\% of the MLFMM time.
The nearfield kernel accounts for the majority of the MLFMM execution time.
The average kernel-execution speedup moving from K20x to P100 is 5.3x, and the disaggregation kernel speedup is the largest, at 8x. The average kernel-execution speedup moving from K20x to P100 is 5.3x, and the disaggregation kernel speedup is the largest, at 8x.
On both K20x and P100, this kernel's performance is limited by the amount of CUDA shared memory it requires. On both K20x and P100, this kernel's performance is limited by the amount of CUDA shared memory it requires.
In S822LC, the newer Pascal GPU architecture provides 64 KB of shared memory per thread-block rather than the 48 KB on XK, which allows more thread-blocks to run concurrently and provide the disproportionate speedup on that machine. In S822LC, the newer Pascal GPU architecture provides 64 KB of shared memory per thread-block rather than the 48 KB on XK, which allows more thread-blocks to run concurrently and provide the disproportionate speedup on that machine.
\section{Conclusions} \section{Conclusions}
This paper presents MLFMM performance results on three types of computer systems: Blue Waters XE and XK nodes, and an IBM S822LC. This paper presents MLFMM performance results on three types of computer systems: Blue Waters XE and XK nodes, and an IBM S822LC.
MLFMM is realized as matrix operations for excellent performance. MLFMM is realized as matrix operations for excellent performance.
Significant CPU speedup on both systems is achieved with OpenMP, and further eclipsed by CUDA implementations that take advantage of well-understood matrix optimization techniques, up to a speedup of 969x over single-threaded CPU execution on S822LC, bringing execution times from seconds to milliseconds even for large problems. Significant CPU speedup on both systems is achieved with OpenMP, and further eclipsed by CUDA implementations that take advantage of well-understood matrix optimization techniques.
On modern GPUs, this speedup justifies the significant CUDA time investment. A speedup of 969x over single-threaded CPU execution is achieved on S822LC, bringing execution times from seconds to milliseconds even for large problems.
This speedup justifies the significant CUDA time investment.
\section*{Acknowledgment} \section*{Acknowledgments}
This work was supported by the NVIDIA GPU Center of Excellence and the NCSA Petascale Improvement Discovery Program (PAID).
\bibliographystyle{IEEEtran} \bibliographystyle{IEEEtran}
\begin{thebibliography}{99} \begin{thebibliography}{99}
\bibitem{chew01} \bibitem{chew01}
W. C. Chew, et al., W. C. Chew, et al.,
\textit{Fast and efficient algorithms in computational electromagnetics} \textit{Fast and efficient algorithms in computational electromagnetics}.
Artech House, Inc., Artech House,
2001 2001
\bibitem{hwu11} \bibitem{hwu11}
W. Hwu, W. Hwu,
\textit{GPU Computing Gems Emerald Edition} \textit{GPU Computing Gems Emerald Edition}.
Elsevier, Elsevier,
2011 2011
@@ -214,7 +195,23 @@ Available: https://bluewaters.ncsa.illinois.edu/hardware-summary.
\vfill \pagebreak \vfill \pagebreak
%\begin{table}{}
%\centering \caption{Evaluation Systems} \label{tab:systems}
%\begin{tabular}{|c|c|c|c|}
%\hline & \textbf{XK Node} & \textbf{XE Node} & \textbf{S822LC} \\
%\hline
%\hline \textbf{CPU 1} & AMD Opteron 6276 & AMD Opteron 6276 & IBM Power8 \\
%\hline \textbf{CPU 2} & -- & AMD Opteron 6276 & IBM Power8 \\
%\hline
%\hline \textbf{GPU 1} & \makecell{K20X \\ (6 GB RAM) } & -- & P100 (16GB RAM) \\
%\hline \textbf{GPU 2} & -- & -- & P100 (16GB RAM) \\
%\hline \textbf{GPU 3} & -- & -- & P100 (16GB RAM) \\
%\hline \textbf{GPU 4} & -- & -- & P100 (16GB RAM) \\
%\hline \textbf{RAM} & 32GB & 64 GB & 512 GB \\
%\hline \makecell{\textbf{CPU-GPU} \\ \textbf{Bus}} & PCIe & -- & NVLink \\
%\hline
%\end{tabular}
%\end{table}
%\subsection{Computation Kernel Breakdown} %\subsection{Computation Kernel Breakdown}