From d924f2963b0be25431bc6d56c9b7947ba098b360 Mon Sep 17 00:00:00 2001
From: Carl Pearson <carl.w.pearson@gmail.com>
Date: Fri, 13 Mar 2020 17:31:55 -0600
Subject: [PATCH] stencil paper revision

---
 content/publication/2020522_pearson_iwapt/index.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/content/publication/2020522_pearson_iwapt/index.md b/content/publication/2020522_pearson_iwapt/index.md
index ea336c2..acb8970 100644
--- a/content/publication/2020522_pearson_iwapt/index.md
+++ b/content/publication/2020522_pearson_iwapt/index.md
@@ -26,9 +26,10 @@ abstract = """
 High-performance distributed computing systems increasingly feature nodes that have multiple CPU sockets and multiple GPUs.
 The communication bandwidth between these components is non-uniform.
 Furthermore, these systems can expose different communication capabilities between these components.
-For communication-heavy applications, optimally using these capabilities is challenging and essential for performance.
-This work presents approaches for automatic data placement and communication implementation for 3D stencil codes on multi-GPU nodes with non-homogeneous communication performance and capabilities.
-Benchmarking results in the Summit system show that choices in placement can result in a 20% improvement in single-node exchange, and communication specialization canyield a further 6x improvement in exchange time in a single node, and a 16% improvement at 1536 GPUs"""
+For communication-heavy applications, optimally using these capabilities is challenging and essential for performance.
+Bespoke codes with optimized communication may be non-portable across run-time/software/hardware configurations, and existing stencil frameworks neglect optimized communication.
+This work presents node-aware approaches for automatic data placement and communication implementation for 3D stencil codes on multi-GPU nodes with non-homogeneous communication performance and capabilities.
+Benchmarking results in the Summit system show that choices in placement can result in a 20% improvement in single-node exchange, and communication specialization can yield a further 6x improvement in exchange time in a single node, and a 16% improvement at 1536 GPUs."""
 abstract_short = ""