From fa94f717e339e26420bfb0f3757c05e09e2f4252 Mon Sep 17 00:00:00 2001
From: Carl Pearson <me@carlpearson.net>
Date: Sat, 5 Jun 2021 15:23:09 -0600
Subject: [PATCH] add HPEC to 2018 mailthody

---
 content/publication/20180925_mailthody_hpec.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/content/publication/20180925_mailthody_hpec.md b/content/publication/20180925_mailthody_hpec.md
index 52c1943..f4fdea1 100644
--- a/content/publication/20180925_mailthody_hpec.md
+++ b/content/publication/20180925_mailthody_hpec.md
@@ -1,5 +1,5 @@
 +++
-title = "Collaborative (CPU+ GPU) Algorithms for Triangle Counting and Truss Decomposition"
+title = "[HPEC] Collaborative (CPU + GPU) Algorithms for Triangle Counting and Truss Decomposition"
 date = 2018-09-25
 draft = false
 tags = ["pangolin"]
@@ -8,7 +8,7 @@ tags = ["pangolin"]
 
 **Vikram S. Mailthody, Ketan Date, Zaid Qureshi, Carl Pearson, Rakesh Nagi, Jinjun Xiong, Wen-Mei Hwu**
 
-In *2018 IEEE High Performance extreme Computing Conference*
+In *2018 IEEE High Performance Extreme Computing Conference*
 
 In this paper, we present an update to our previous submission  from  Graph  Challenge  2017.  This  work  describes and evaluates new software algorithm optimizations undertaken for our 2018 year submission on Collaborative CPU+GPU Algorithms for Triangle Counting and Truss Decomposition. First, we describe four major optimizations for the triangle counting which improved performance by up to 117x over our prior submission. Additionally,  we  show  that  our triangle-counting  algorithm  is on average 151.7x faster than NVIDIA’s NVGraph library (max 476x)  for  SNAP  datasets.  Second,  we  propose  a  novel  parallel k-truss  decomposition  algorithm  that  is  time-efficient  and  is  up to 13.9x faster than our previous submission. Third, we evaluate the  effect  of  generational  hardware  improvements  between  the IBM  “Minsky”  (POWER8,  P100,  NVLink  1.0)  and  “Newell” (POWER9,  V100,  NVLink  2.0)  platforms.  Lastly,  the  software optimizations presented in this work and the hardware improvements  in  the  Newell  platform  enable  analytics  and  discovery  on large graphs  with millions of nodes  and billions of edges  in less than a minute. In sum, the new algorithmic implementations are significantly  faster  and  can  handle  much  larger  “big”  graphs.