From 3a685bf1a6af317407792273fcb7fcf84b403ca3 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 27 Jan 2021 17:40:20 -0700 Subject: [PATCH] update anatole, work on publications, add talks list --- assets/css/custom.css | 4 +- config/_default/config.toml | 44 ++++++---- content/about.md | 35 +------- content/project/app_studies.md | 5 +- content/project/gpu_ci/index.md | 6 +- .../{2014chen.md => 20140601_chen.md} | 8 +- content/publication/2016dakkak.md | 58 +++---------- .../publication/20170621_hidayetoglu_cem.md | 50 +++-------- content/publication/20170621_hwu_cem.md | 85 ++----------------- content/publication/20170621_pearson_cem.md | 69 +++------------ content/publication/20170910_hwu_bsc.md | 81 ------------------ content/publication/20171108_hwu_icrc.md | 22 +---- content/publication/2017dakkak.md | 78 ++++------------- content/publication/2017hidayetoglu.md | 47 ++-------- .../publication/20180521_hidayetoglu_ipdps.md | 6 +- content/publication/20180625_pearson_ms.md | 65 +++----------- content/publication/20180628_pearson_iwoph.md | 6 +- content/publication/20180919_pearson_arxiv.md | 2 + .../publication/20180925_mailthody_hpec.md | 6 +- .../20190410_pearson_icpe/index.md | 80 +++-------------- .../20190926_almasri_hpec/index.md | 16 ++-- .../publication/20190926_huang_hpec/index.md | 7 +- .../20190926_pearson_hpec/index.md | 74 +++------------- .../20200522_pearson_iwapt/index.md | 19 ++--- .../20200923_hidayetoglu_hpec/index.md | 25 +++--- .../20201229_pearson_arxiv/index.md | 72 +++------------- content/talk/20170910_hwu_bsc.md | 12 +++ themes/anatole | 2 +- 28 files changed, 204 insertions(+), 780 deletions(-) rename content/publication/{2014chen.md => 20140601_chen.md} (86%) delete mode 100644 content/publication/20170910_hwu_bsc.md create mode 100644 content/talk/20170910_hwu_bsc.md diff --git a/assets/css/custom.css b/assets/css/custom.css index 4d181ec..a951750 100644 --- a/assets/css/custom.css +++ b/assets/css/custom.css @@ -1,5 +1,5 @@ /* fix post names being too long*/ -.list-with-title .listing .listing-post a { +/* .list-with-title .listing .listing-post a { max-width:90%; display: inline-block; -} \ No newline at end of file +} */ \ No newline at end of file diff --git a/config/_default/config.toml b/config/_default/config.toml index 9586de1..601b557 100644 --- a/config/_default/config.toml +++ b/config/_default/config.toml @@ -90,18 +90,6 @@ url = "https://scholar.google.com/citations?user=K2nzqpYAAAAJ&hl=en" identifier = "about" url = "/about/" - [[menu.main]] - name = "Recognition" - weight = 200 - identifier = "awards" - url = "/awards/" - - [[menu.main]] - name = "Experience" - weight = 200 - identifier = "experience" - url = "/experience/" - [[menu.main]] name = "Publications" weight = 300 @@ -109,13 +97,33 @@ url = "https://scholar.google.com/citations?user=K2nzqpYAAAAJ&hl=en" url = "/publication/" [[menu.main]] - name = "Posts" + name = "Projects" weight = 400 + identifier = "projects" + url = "/project/" + + [[menu.main]] + name = "Recognition" + weight = 500 + identifier = "awards" + url = "/awards/" + + [[menu.main]] + name = "Experience" + weight = 600 + identifier = "experience" + url = "/experience/" + + [[menu.main]] + name = "Talks" + weight = 700 + identifier = "talks" + url = "/talk/" + + [[menu.main]] + name = "Posts" + weight = 800 identifier = "posts" url = "/post/" - [[menu.main]] - name = "Projects" - weight = 500 - identifier = "projects" - url = "/project/" + diff --git a/content/about.md b/content/about.md index 91c9b1f..f748f54 
100644 --- a/content/about.md +++ b/content/about.md @@ -10,37 +10,4 @@ I am a PhD candidate in the Electrical and Computer Engineering department at th I am working on multi-GPU communication and scaling as part of the joint UIUC / IBM C3SR cognitive computing systems research center. The focus of these activities is to apply tools and techniques developed in the IMPACT group to improve the performance of real-world applications. -## Teaching - -- 2018 Spring University of Illinois Project TA for ECE408/CS483 -- 2017 Fall University of Illinois Head TA for ECE408/CS483 -- 2017-2018 University of Illinois [Mavis Future Faculty Fellow](http://publish.illinois.edu/engr-mavis/2017-2018-mavis-fellows/). -- 2015 Fall University of Illinois TA for ECE408 - -I have been a teaching assistant for the following courses: - -- ECE408/CS483: Heterogeneous Parallel Programming at the University of Illinois -- E155: Microprocesser-based Systems: Design & Applications at Harvey Mudd College -- E85: Digital Electronics and Computer Architecture at Harvey Mudd College - -I have also been a teaching assistant for the Programming and Tuning Massively Parallel Systems -[(PUMPS)](http://bcw.ac.upc.edu/PUMPS2017/) summer school in Barcelona since 2014. - -I have also mentored undergraduates and a high school student, who is a co-author on two papers. - -During the Mavis fellowship, I administered the ECE 408 GPU programming project in spring 2018. I created - -* Four lectures on machine learning ([1][l1], [2][l2], [3][l3], [4][l4]) -* A [course project][project_repo] where students add a GPU convolution operator to MXNet. -* Project kickoff [slides][kickoff-slides] ([repo][kickoff-repo]). - -[project_repo]: https://github.com/illinois-impact/ece408_project -[l1]: ../../pdf/2017FA_ECE408_dl01_Intro.pdf -[l2]: ../../pptx/2017FA_ECE408_dl02_FF-Gradient.pptx -[l3]: ../../pptx/2017FA_ECE408_dl03_CNN01.pptx -[l4]: ../../pptx/2017FA_ECE408_dl04_CNN02.pptx -[kickoff-slides]: ../../pdf/2017FA_ECE408_project-kickoff.pdf -[kickoff-repo]: https://github.com/illinois-impact/ece408_project-kickoff-slides - -I also created a set of resources on using Nvidia's Nsight Compute and Nsight Systems performance profiling tools, including a 75 minute recorded lecture. -See the [Github repository](https://github.com/cwpearson/nvidia-performance-tools) to get started. \ No newline at end of file +Professional inquiries should be directed to `` at `illinois.edu`. \ No newline at end of file diff --git a/content/project/app_studies.md b/content/project/app_studies.md index 8a04810..2002375 100644 --- a/content/project/app_studies.md +++ b/content/project/app_studies.md @@ -3,10 +3,9 @@ date = "2017-03-13T04:40:57-05:00" title = "High-Performance Application Studies" external_link = "" image_preview = "" -summary = "Tools and Techniques for Code Acceleration" highlight = true math = false -tags = ["impact","c3sr"] +tags = ["applications"] [header] caption = "" @@ -14,6 +13,8 @@ tags = ["impact","c3sr"] +++ +### Tools and Techniques for Code Acceleration + I worked with Abdul Dakkak and Cheng Li at Illinois on a full-stack performance analysis platform for machine learning software. The same infrastructure is also used to deliver some course projects for GPU programming courses. I worked with Mert Hidayetoglu at Illinois on GPU acceleration of a massively-parallel tomographic image reconstruction code.
diff --git a/content/project/gpu_ci/index.md b/content/project/gpu_ci/index.md index 16155b5..e1862af 100644 --- a/content/project/gpu_ci/index.md +++ b/content/project/gpu_ci/index.md @@ -1,14 +1,12 @@ --- title: GPU Continuous Integration with Azure Pipelines -summary: tags: - personal date: "2016-04-27T00:00:00Z" -# Optional external URL for project (replaces project detail page). -external_link: "https://www.github.com/cwpearson/azure-pipelines-agent" - image: caption: focal_point: Smart --- + +* [github](https://www.github.com/cwpearson/azure-pipelines-agent) \ No newline at end of file diff --git a/content/publication/2014chen.md b/content/publication/20140601_chen.md similarity index 86% rename from content/publication/2014chen.md rename to content/publication/20140601_chen.md index ae779fa..dee2522 100644 --- a/content/publication/2014chen.md +++ b/content/publication/20140601_chen.md @@ -1,8 +1,8 @@ +++ author = "Carl Pearson" -title = "[MES'14] Adaptive Cache Bypass and Insertion for Many-Core Accelerators" -date = 2014-01-01 -description = "Guide to Thumbnails in Hugo" +title = "[MES] Adaptive Cache Bypass and Insertion for Many-Core Accelerators" +date = 2014-06-01 +description = "" tags = [] thumbnail= "" @@ -11,7 +11,7 @@ draft = false **Xuhao Chen, Shengzhao Wu, Li-Wen Chang, Wei-Sheng Huang, Carl Pearson, Wen-mei Hwu** -*Proceedings of International Workshop on Manycore Embedded Systems.* +In *Proceedings of International Workshop on Manycore Embedded Systems.* Many-core accelerators, e.g. GPUs, are widely used for accelerating general-purpose compute kernels. With the SIMT execution model, GPUs can hide memory latency through massive multithreading for many regular applications. To support more applications with irregular memory access pattern, cache hierarchy is introduced to GPU architecture to capture input data sharing and mitigate the effect of irregular accesses. However, GPU caches suffer from poor efficiency due to severe contention, which makes it difficult to adopt heuristic management policies, and also limits system performance and energy-efficiency. We propose an adaptive cache management policy specifically for many-core accelerators. The tag array of L2 cache is enhanced with extra bits to track memory access history, and thus the locality information is captured and provided to L1 cache as heuristics to guide its run-time bypass and insertion decisions. By preventing un-reused data from polluting the cache and alleviating contention, cache efficiency is significantly improved. As a result, the system performance is improved by 31% on average for cache sensitive benchmarks, compared to the baseline GPU architecture. diff --git a/content/publication/2016dakkak.md b/content/publication/2016dakkak.md index e9295c8..43891df 100644 --- a/content/publication/2016dakkak.md +++ b/content/publication/2016dakkak.md @@ -1,52 +1,18 @@ +++ +author = "Carl Pearson" +title = "[IPDPSw] WebGPU: A Scalable Online Development Platform for GPU Programming Courses" +date = 2016-05-23 +description = "" +tags = ["teaching"] +thumbnail= "" + draft = false ++++ -date = "2016-01-01" -title = "WebGPU: A Scalable Online Development Platform for GPU Programming Courses" -authors = ["Adbul Dakkak", "Carl Pearson", "Cheng Li"] +**Abdul Dakkak, Carl Pearson, Cheng Li** -# Publication type.
-# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section publication_types = ["1"] +In *Parallel and Distributed Processing Symposium Workshops, 2016 IEEE International.* -abstract = 'The popularity of computer science classes offered through Massive Open On-line Courses (MOOCs) creates both opportunities and challenges. Programming-based classes need to provide consistent development infrastructures that are both scalable and user friendly to students. The \"Heterogeneous Parallel Programming\" class offered through Coursera teaches GPU programming and encountered these problems. We developed WebGPU - an online GPU development platform - providing students with a user friendly scalable GPU computing platform throughout the course. It has been used as the CUDA, OpenACC, and OpenCL programming environment for large Coursera courses, short-running summer schools, and traditional semester-long graduate and undergraduate courses. WebGPU has since replaced our traditional development infrastructure for the GPU classes offered at UIUC. This paper presents the original, revised, and upcoming WebGPU designs that address the requirements and challenges of offering sophisticated computing resources to a large, quickly-varying number of students.' +The popularity of computer science classes offered through Massive Open On-line Courses (MOOCs) creates both opportunities and challenges. Programming-based classes need to provide consistent development infrastructures that are both scalable and user friendly to students. The "Heterogeneous Parallel Programming" class offered through Coursera teaches GPU programming and encountered these problems. We developed WebGPU - an online GPU development platform - providing students with a user friendly scalable GPU computing platform throughout the course. It has been used as the CUDA, OpenACC, and OpenCL programming environment for large Coursera courses, short-running summer schools, and traditional semester-long graduate and undergraduate courses. WebGPU has since replaced our traditional development infrastructure for the GPU classes offered at UIUC. This paper presents the original, revised, and upcoming WebGPU designs that address the requirements and challenges of offering sophisticated computing resources to a large, quickly-varying number of students. - -math = false publication = "In *Parallel and Distributed Processing Symposium Workshops, 2016 IEEE International.*" publication_short = "IN *IPDPS*" -url_code = "" -url_dataset = "" -url_pdf = "pdf/2016dakkak.pdf" -url_project = "" -url_slides = "" -url_video = "" -selected = true -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = ["teaching_tools"] -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder.
-[image] # Caption (optional) caption = "" - # Focal point (optional) # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight focal_point = "" -+++ \ No newline at end of file +* [pdf](/pdf/2016dakkak.pdf) \ No newline at end of file diff --git a/content/publication/20170621_hidayetoglu_cem.md b/content/publication/20170621_hidayetoglu_cem.md index 20492c6..78f46f0 100644 --- a/content/publication/20170621_hidayetoglu_cem.md +++ b/content/publication/20170621_hidayetoglu_cem.md @@ -2,51 +2,23 @@ draft = false date = "2017-06-21" -title = "Scalable Parallel DBIM Solutions of Inverse-Scattering Problems" -authors = ["Mert Hidayetoglu", "Carl Pearson", "Levent Gurel", "Wen-mei Hwu", "Weng Cho Chew"] - -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["1"] - - -abstract = 'We report scalable solutions of inverse-scattering problems with the distorted Born iterative method (DBIM) on large number of computing nodes. Distributing forward solutions does not scale well when the number of illuminations is not greater than the number of computing nodes. As a remedy, we distribute both forward solutions and the corresponding forward solvers to improve granularity of DBIM solutions. This paper provides a set of solutions demonstrating good scaling of the proposed parallelization strategy up to 1,024 computing nodes, employing 16,394 processing cores in total.' - +title = "[CEM] Scalable Parallel DBIM Solutions of Inverse-Scattering Problems" math = false publication = "Computing and Electromagnetics International Workshop (CEM), 2017" -url_code = "" -url_dataset = "" -url_pdf = "pdf/20170621_hidayetoglu_cem.pdf" -url_project = "" -url_slides = "" -url_video = "" + selected = false -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = ["app_studies"] +tags = ["applications"] -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder. -[image] - # Caption (optional) - caption = "" ++++ - # Focal point (optional) - # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight - focal_point = "" -+++ \ No newline at end of file +**Mert Hidayetoglu, Carl Pearson, Levent Gurel, Wen-mei Hwu, Weng Cho Chew** + +In *Computing and Electromagnetics International Workshop (CEM), 2017* + +We report scalable solutions of inverse-scattering problems with the distorted Born iterative method (DBIM) on a large number of computing nodes. Distributing forward solutions does not scale well when the number of illuminations is not greater than the number of computing nodes. As a remedy, we distribute both forward solutions and the corresponding forward solvers to improve granularity of DBIM solutions. This paper provides a set of solutions demonstrating good scaling of the proposed parallelization strategy up to 1,024 computing nodes, employing 16,394 processing cores in total.
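The remedy the abstract describes, distributing the forward solvers as well as the forward solutions, maps naturally onto MPI sub-communicators: ranks sharing a "color" form a team that cooperates on one forward solve. The sketch below is illustrative only and is not the paper's code; `num_illuminations` and `solve_forward_problem` are hypothetical stand-ins, and it assumes the rank count is a multiple of the illumination count.

```cpp
// Sketch: improve granularity by giving each illumination a team of ranks,
// so each forward solver is itself parallelized within its team.
// Assumes size is a multiple of num_illuminations; names are illustrative.
#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  int rank = 0, size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int num_illuminations = 64;               // hypothetical problem setup
  const int team_size = size / num_illuminations; // ranks cooperating per solve
  const int illumination = rank / team_size;      // which forward problem this rank helps

  // Ranks with the same color form one forward-solver sub-communicator.
  MPI_Comm solver_comm;
  MPI_Comm_split(MPI_COMM_WORLD, illumination, rank, &solver_comm);

  // Each team computes one forward solution per DBIM iteration; a real
  // solver would distribute its MLFMA work over solver_comm.
  // solve_forward_problem(illumination, solver_comm);  // hypothetical

  MPI_Comm_free(&solver_comm);
  MPI_Finalize();
  return 0;
}
```

With one rank per illumination this reduces to the original distribution; with more ranks than illuminations, the extra ranks still contribute inside a team, which is the granularity improvement the abstract reports scaling to 1,024 nodes.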
+ +* [pdf](/pdf/20170621_hidayetoglu_cem.pdf) \ No newline at end of file diff --git a/content/publication/20170621_hwu_cem.md b/content/publication/20170621_hwu_cem.md index 670750e..f4448d1 100644 --- a/content/publication/20170621_hwu_cem.md +++ b/content/publication/20170621_hwu_cem.md @@ -1,81 +1,14 @@ +++ -title = "Thoughts on Massively-Parallel Heterogeneous Computing for Solving Large Problems" +title = "[CEM] Thoughts on Massively-Parallel Heterogeneous Computing for Solving Large Problems" date = 2017-06-21 draft = false -# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`. -authors = ["Wen-mei Hwu", "Mert Hidayetoglu", "Weng Cho Chew", "Carl Pearson", "Simon Garcia de Gonzalo", "Sitao Huang", "Abdul Dakkak"] - -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["1"] - -# Publication name and optional abbreviated version. -publication = "In *Computing and Electromagnetics International Workshop*." -publication_short = "In *CEM*" - -# Abstract and optional shortened version. -abstract = "In this paper, we present our view of massively-parallel heterogeneous computing for solving large scientific problems. We start by observing that computing has been the primary driver of major innovations since the beginning of the 21st century. We argue that this is the fruit of decades of progress in computing methods, technology, and systems. A high-level analysis on out-scaling and up-scaling on large supercomputers is given through a time-domain wave-scattering simulation example. The importance of heterogeneous node architectures for good up-scaling is highlighted. A case for low-complexity algorithms is made for continued scale-out towards exascale systems." -abstract_short = "" - -# Is this a selected publication? (true/false) -selected = false - -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = ["app_studies"] - -# Slides (optional). -# Associate this publication with Markdown slides. -# Simply enter your slide deck's filename without extension. -# E.g. `slides = "example-slides"` references -# `content/slides/example-slides.md`. -# Otherwise, set `slides = ""`. -slides = "example-slides" - -# Tags (optional). -# Set `tags = []` for no tags, or use the form `tags = ["A Tag", "Another Tag"]` for one or more tags. -tags = [] - -# Links (optional). -url_pdf = "pdf/20170621_hwu_cem.pdf" -url_preprint = "" -url_code = "" -url_dataset = "" -url_project = "" -url_slides = "" -url_video = "" -url_poster = "" -url_source = "" - -# Custom links (optional). -# Uncomment line below to enable. For multiple links, use the form `[{...}, {...}, {...}]`. -# url_custom = [{name = "Custom Link", url = "http://example.org"}] - -# Digital Object Identifier (DOI) -doi = "" - -# Does this page contain LaTeX math? (true/false) -math = false - -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder. 
-[image] - # Caption (optional) - caption = "Image credit: [**Unsplash**](https://unsplash.com/photos/jdD8gXaTZsc)" - - # Focal point (optional) - # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight - focal_point = "" +++ + +**Wen-mei Hwu, Mert Hidayetoglu, Weng Cho Chew, Carl Pearson, Simon Garcia de Gonzalo, Sitao Huang, Abdul Dakkak** + +In *Computing and Electromagnetics International Workshop 2017* + +In this paper, we present our view of massively-parallel heterogeneous computing for solving large scientific problems. We start by observing that computing has been the primary driver of major innovations since the beginning of the 21st century. We argue that this is the fruit of decades of progress in computing methods, technology, and systems. A high-level analysis on out-scaling and up-scaling on large supercomputers is given through a time-domain wave-scattering simulation example. The importance of heterogeneous node architectures for good up-scaling is highlighted. A case for low-complexity algorithms is made for continued scale-out towards exascale systems. + +* [pdf](/pdf/20170621_hwu_cem.pdf) \ No newline at end of file diff --git a/content/publication/20170621_pearson_cem.md b/content/publication/20170621_pearson_cem.md index 04234d2..e739271 100644 --- a/content/publication/20170621_pearson_cem.md +++ b/content/publication/20170621_pearson_cem.md @@ -1,69 +1,20 @@ +++ -title = "Comparative Performance Evaluation of Multi-GPU MLFMM Implementation for 2-D VIE Problems" +title = "[CEM] Comparative Performance Evaluation of Multi-GPU MLFMM Implementation for 2-D VIE Problems" date = 2017-06-21 draft = false -# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`. -authors = ["Carl Pearson", "Mert Hidayetoglu", "Wei Ren", "Weng Cho Chew", "Wen-Mei Hwu"] - -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference proceedings -# 2 = Journal -# 3 = Work in progress -# 4 = Technical report -# 5 = Book -# 6 = Book chapter -publication_types = ["1"] - -# Publication name and optional abbreviated version. -publication = "In *Computing and Electromagnetics International Workshop, IEEE 2017*" -publication_short = "In CEM" - -# Abstract and optional shortened version. -abstract = "We compare multi-GPU performance of the multilevel fast multipole method (MLFMM) on two different systems: A shared-memory IBM S822LC workstation with four NVIDIA P100 GPUs, and 16 XK nodes (each is employed with a single NVIDIA K20X GPU) of the Blue Waters supercomputer. MLFMM is implemented for solving scattering problems involving two-dimensional inhomogeneous bodies. Results show that the multi-GPU implementation provides 794 and 969 times speedups on the IBM and Blue Waters systems over their corresponding sequential CPU executions, respectively, where the sequential execution on the IBM system is 1.17 times faster than on the Blue Waters System." -abstract_short = "" - -# Does this page contain LaTeX math? (true/false) -math = false - -# Does this page require source code highlighting? (true/false) -highlight = true - -# Featured image thumbnail (optional) -image_preview = "" - -# Is this a selected publication? (true/false) -selected = false - -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. 
-projects = ["app_studies"] +tags = ["applications"] # Links (optional) url_pdf = "pdf/20170621_pearson_cem.pdf" -url_preprint = "" -url_code = "" -url_dataset = "" -url_project = "" -url_slides = "" -url_video = "" -url_poster = "" -url_source = "" -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder. -[image] - # Caption (optional) - caption = "" - - # Focal point (optional) - # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight - focal_point = "" +++ + +**Carl Pearson, Mert Hidayetoglu, Wei Ren, Weng Cho Chew, Wen-mei Hwu** + +In *Computing and Electromagnetics International Workshop, IEEE 2017* + +We compare multi-GPU performance of the multilevel fast multipole method (MLFMM) on two different systems: a shared-memory IBM S822LC workstation with four NVIDIA P100 GPUs, and 16 XK nodes (each employed with a single NVIDIA K20X GPU) of the Blue Waters supercomputer. MLFMM is implemented for solving scattering problems involving two-dimensional inhomogeneous bodies. Results show that the multi-GPU implementation provides 794 and 969 times speedups on the IBM and Blue Waters systems over their corresponding sequential CPU executions, respectively, where the sequential execution on the IBM system is 1.17 times faster than on the Blue Waters system. + +* [pdf](/pdf/20170621_pearson_cem.pdf) \ No newline at end of file diff --git a/content/publication/20170910_hwu_bsc.md b/content/publication/20170910_hwu_bsc.md deleted file mode 100644 index c946827..0000000 --- a/content/publication/20170910_hwu_bsc.md +++ /dev/null @@ -1,81 +0,0 @@ -+++ -title = "Innovative Applications and Technology Pivots - A Perfect Storm in Computing" -date = 2017-09-10 -draft = false - -# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`. -authors = ["Wen-mei Hwu", "Izzat El Hajj", "Simon Garcia de Gonzalo", "Carl Pearson", "Nam Sung Kim", "Deming Chen", "Jinjun Xiong", "Zehra Sura"] - -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["0"] - -# Publication name and optional abbreviated version. -publication = "Barcelona Supercomputing Center" -publication_short = "*BSC*" - -# Abstract and optional shortened version. -abstract = "" -abstract_short = "" - -# Is this a selected publication? (true/false) -selected = false - -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = [] - -# Slides (optional). -# Associate this publication with Markdown slides. -# Simply enter your slide deck's filename without extension. -# E.g. `slides = "example-slides"` references -# `content/slides/example-slides.md`. -# Otherwise, set `slides = ""`. -slides = "" - -# Tags (optional). -# Set `tags = []` for no tags, or use the form `tags = ["A Tag", "Another Tag"]` for one or more tags. -tags = [] - -# Links (optional). -url_pdf = "" -url_preprint = "" -url_code = "" -url_dataset = "" -url_project = "" -url_slides = "pdf/20170910_hwu_bsc.pdf" -url_video = "" -url_poster = "" -url_source = "" - -# Custom links (optional). -# Uncomment line below to enable. For multiple links, use the form `[{...}, {...}, {...}]`.
-# url_custom = [{name = "Custom Link", url = "http://example.org"}] - -# Digital Object Identifier (DOI) -doi = "" - -# Does this page contain LaTeX math? (true/false) -math = false - -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder. -[image] - # Caption (optional) - caption = "Image credit: [**Unsplash**](https://unsplash.com/photos/jdD8gXaTZsc)" - - # Focal point (optional) - # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight - focal_point = "" -+++ diff --git a/content/publication/20171108_hwu_icrc.md b/content/publication/20171108_hwu_icrc.md index 9b37938..4490dc1 100644 --- a/content/publication/20171108_hwu_icrc.md +++ b/content/publication/20171108_hwu_icrc.md @@ -3,28 +3,10 @@ title = "Rebooting the Data Access Hierarchy of Computing Systems" date = 2017-11-08 draft = false -# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`. -authors = ["Wen-mei Hwu", "Izzat El Hajj", "Simon Garcia de Gonzalo", "Carl Pearson", "Nam Sung Kim", "Deming Chen", "Jinjun Xiong", "Zehra Sura"] - -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["0"] - # Publication name and optional abbreviated version. publication = "In *2017 IEEE International Conference on Rebooting Computing*." publication_short = "In *ICRC*" -# Abstract and optional shortened version. -abstract = "In this paper, we present our view of massively-parallel heterogeneous computing for solving large scientific problems. We start by observing that computing has been the primary driver of major innovations since the beginning of the 21st century. We argue that this is the fruit of decades of progress in computing methods, technology, and systems. A high-level analysis on out-scaling and up-scaling on large supercomputers is given through a time-domain wave-scattering simulation example. The importance of heterogeneous node architectures for good up-scaling is highlighted. A case for low-complexity algorithms is made for continued scale-out towards exascale systems." -abstract_short = "" # Is this a selected publication? (true/false) selected = false @@ -79,3 +61,7 @@ math = false # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight focal_point = "" +++ + +**Wen-mei Hwu, Izzat El Hajj, Simon Garcia de Gonzalo, Carl Pearson, Nam Sung Kim, Deming Chen, Jinjun Xiong, Zehra Sura** + +In this paper, we present our view of massively-parallel heterogeneous computing for solving large scientific problems. We start by observing that computing has been the primary driver of major innovations since the beginning of the 21st century. We argue that this is the fruit of decades of progress in computing methods, technology, and systems. A high-level analysis on out-scaling and up-scaling on large supercomputers is given through a time-domain wave-scattering simulation example. The importance of heterogeneous node architectures for good up-scaling is highlighted. A case for low-complexity algorithms is made for continued scale-out towards exascale systems.
\ No newline at end of file diff --git a/content/publication/2017dakkak.md b/content/publication/2017dakkak.md index 79d8601..cd07075 100644 --- a/content/publication/2017dakkak.md +++ b/content/publication/2017dakkak.md @@ -1,66 +1,20 @@ +++ -title = "RAI: A Scalable Project Submission System for Parallel Programming Courses" +author = "Carl Pearson" +title = "[IPDPSw] RAI: A Scalable Project Submission System for Parallel Programming Courses" date = 2017-05-29 +description = "" +tags = ["teaching"] +thumbnail= "" + draft = false - -# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`. -authors = ["Adbul Dakkak", "Carl Pearson", "Cheng Li"] - -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["1"] - -# Publication name and optional abbreviated version. -publication = "*Parallel and Distributed Processing Symposium Workshops, 2017 IEEE International.* IEEE, 2017." -publication_short = "IPDPS Workshop 2017" - -# Abstract and optional shortened version. -abstract = "A major component of many advanced programming courses is an open-ended “end-of-term project” assignment. Delivering and evaluating open-ended parallel programming projects for hundreds or thousands of students brings a need for broad system reconfigurability coupled with challenges of testing and development uniformity, access to esoteric hardware and programming environments, scalability, and security. We present RAI, a secure and extensible system for delivering open-ended programming assignments configured with access to different hardware and software requirements. We describe how the system was used to deliver a programming-competition-style final project in an introductory GPU programming course at the University of Illinois Urbana-Champaign." -abstract_short = "" - -# Does this page contain LaTeX math? (true/false) -math = false - -# Does this page require source code highlighting? (true/false) -highlight = false - -# Is this a selected publication? (true/false) -selected = true - -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = ["teaching_tools"] - -# Links (optional) -url_pdf = "pdf/rai-edupar2017.pdf" -url_preprint = "" -url_code = "" -url_dataset = "" -url_project = "" -url_slides = "" -url_video = "" -url_poster = "" -url_source = "" - -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder. -[image] - # Caption (optional) - caption = "" - - # Focal point (optional) - # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight - focal_point = "" - +++ + + + +**Abdul Dakkak, Carl Pearson, Cheng Li** + +In *Parallel and Distributed Processing Symposium Workshops, 2017 IEEE International.* + +A major component of many advanced programming courses is an open-ended “end-of-term project” assignment. Delivering and evaluating open-ended parallel programming projects for hundreds or thousands of students brings a need for broad system reconfigurability coupled with challenges of testing and development uniformity, access to esoteric hardware and programming environments, scalability, and security.
We present RAI, a secure and extensible system for delivering open-ended programming assignments configured with access to different hardware and software requirements. We describe how the system was used to deliver a programming-competition-style final project in an introductory GPU programming course at the University of Illinois Urbana-Champaign. + +* [pdf](/pdf/rai-edupar2017.pdf) \ No newline at end of file diff --git a/content/publication/2017hidayetoglu.md b/content/publication/2017hidayetoglu.md index 7e665da..58a53a6 100644 --- a/content/publication/2017hidayetoglu.md +++ b/content/publication/2017hidayetoglu.md @@ -1,50 +1,21 @@ +++ draft = false -date = "2017-03-28" -title = "Large Inverse-Scattering Solutions with DBIM on GPU-Enabled Supercomputers" +date = 2017-03-28 +title = "[ACES] Large Inverse-Scattering Solutions with DBIM on GPU-Enabled Supercomputers" authors = ["Mert Hidayetoglu", "Carl Pearson", "Weng Cho Chew", "Levent Gurel", "Wen-mei Hwu"] -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["1"] - -abstract = 'We report inverse-scattering solutions on supercomputers involving large numbers of graphics processing units (GPUs). The distorted-Born iterative method (DBIM) is employed for the iterative inversions. In each iteration, the required forward problems are distributed among computing nodes equipped with GPUs, and solved with the multilevel fast multipole algorithm. A tomographic reconstruction of a synthetic object with a linear dimension of one hundred wavelengths is obtained on 256 GPUs. The results show that DBIM obtains images approximately four times faster on GPUs, compared to parallel executions on traditional CPU-only computing nodes.' math = false -publication = "In *Applied and Computational Electromagnetics Symposium, 2017.* For the special session: Big Data Aspects" -url_code = "" -url_dataset = "" -url_pdf = "pdf/2017aces-dbim.pdf" -url_project = "" -url_slides = "" -url_video = "" +tags = ["applications"] -selected = false ++++ -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = ["app_studies"] +**Mert Hidayetoglu, Carl Pearson, Weng Cho Chew, Levent Gurel, Wen-mei Hwu** -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder. -[image] - # Caption (optional) - caption = "" +In *Applied and Computational Electromagnetics Symposium, 2017* - # Focal point (optional) - # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight - focal_point = "" -+++ \ No newline at end of file +We report inverse-scattering solutions on supercomputers involving large numbers of graphics processing units (GPUs). The distorted-Born iterative method (DBIM) is employed for the iterative inversions. In each iteration, the required forward problems are distributed among computing nodes equipped with GPUs, and solved with the multilevel fast multipole algorithm. A tomographic reconstruction of a synthetic object with a linear dimension of one hundred wavelengths is obtained on 256 GPUs. 
The results show that DBIM obtains images approximately four times faster on GPUs, compared to parallel executions on traditional CPU-only computing nodes. + +* [pdf](/pdf/2017aces-dbim.pdf) \ No newline at end of file diff --git a/content/publication/20180521_hidayetoglu_ipdps.md b/content/publication/20180521_hidayetoglu_ipdps.md index ab7bade..a6eee07 100644 --- a/content/publication/20180521_hidayetoglu_ipdps.md +++ b/content/publication/20180521_hidayetoglu_ipdps.md @@ -21,10 +21,6 @@ publication_types = ["1"] publication = "In *2018 IEEE International Parallel and Distributed Processing Symposium*" publication_short = "In *IPDPS*" -# Abstract and optional shortened version. -abstract = "We present a massively-parallel solver for large Helmholtz-type inverse scattering problems. The solver employs the distorted Born iterative method for capturing the multiple-scattering phenomena in image reconstructions. This method requires many full-wave forward-scattering solutions in each iteration, constituting the main performance bottleneck with its high computational complexity. As a remedy, we use the multilevel fast multipole algorithm (MLFMA). The solver scales among computing nodes using a two-dimensional parallelization strategy that distributes illuminations in one dimension, and MLFMA sub-trees in the other dimension. Multi-core CPUs and GPUs are used to provide per-node speedup. We demonstrate a 76% efficiency when scaling from 64 GPUs to 4,096 GPUs. The paper provides reconstruction of a 204.8λ×204.8λ image (4M unknowns) executed on 4,096 GPUs in near-real time (almost 2 minutes). To the best of our knowledge, this is the largest full-wave inverse scattering solution to date, in terms of both image size and computational resources." -abstract_short = "" - # Does this page contain LaTeX math? (true/false) math = false @@ -67,3 +63,5 @@ url_source = "" focal_point = "" +++ + +We present a massively-parallel solver for large Helmholtz-type inverse scattering problems. The solver employs the distorted Born iterative method for capturing the multiple-scattering phenomena in image reconstructions. This method requires many full-wave forward-scattering solutions in each iteration, constituting the main performance bottleneck with its high computational complexity. As a remedy, we use the multilevel fast multipole algorithm (MLFMA). The solver scales among computing nodes using a two-dimensional parallelization strategy that distributes illuminations in one dimension, and MLFMA sub-trees in the other dimension. Multi-core CPUs and GPUs are used to provide per-node speedup. We demonstrate a 76% efficiency when scaling from 64 GPUs to 4,096 GPUs. The paper provides reconstruction of a 204.8λ×204.8λ image (4M unknowns) executed on 4,096 GPUs in near-real time (almost 2 minutes). To the best of our knowledge, this is the largest full-wave inverse scattering solution to date, in terms of both image size and computational resources. \ No newline at end of file diff --git a/content/publication/20180625_pearson_ms.md b/content/publication/20180625_pearson_ms.md index 3db3b86..7e80deb 100644 --- a/content/publication/20180625_pearson_ms.md +++ b/content/publication/20180625_pearson_ms.md @@ -1,60 +1,8 @@ +++ -title = "Heterogeneous Application and System Modeling" +title = "[thesis] Heterogeneous Application and System Modeling" date = 2018-06-25 draft = false - -# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`. -authors = ["Carl Pearson"] - -# Publication type. 
-# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["3"] - -# Publication name and optional abbreviated version. -publication = "M.S. Thesis" -publication_short = "" - -# Abstract and optional shortened version. -abstract = "With the end of Dennard scaling, high-performance computing increasingly relies on heterogeneous systems with specialized hardware to improve application performance. This trend has driven up the complexity of high-performance software development, as developers must manage multiple programming systems and develop system-tuned code to utilize specialized hardware. In addition, it has exacerbated existing challenges of data placement as the specialized hardware often has local memories to fuel its computational demands. In addition to using appropriate software resources to target application computation at the best hardware for the job, application developers now must manage data movement and placement within their application, which also must be specifically tuned to the target system. Instead of relying on the application developer to have specialized knowledge of system characteristics and specialized expertise in multiple programming systems, this work proposes a heterogeneous system communication library that automatically chooses data location and data movement for high-performance application development and execution on heterogeneous systems. This work presents the foundational components of that library: a systematic approach for characterization of system communication links and application communication demands." -abstract_short = "" - -# Does this page contain LaTeX math? (true/false) -math = false - -# Does this page require source code highlighting? (true/false) -highlight = false - -# Featured image thumbnail (optional) -image_preview = "" - -# Is this a selected publication? (true/false) -selected = true - -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = ["scope"] - -# Links (optional) -url_pdf = "pdf/20180625_pearson_ms.pdf" -url_preprint = "" -url_code = "" -url_dataset = "" -url_project = "" -url_slides = "" -url_video = "" -url_poster = "" -url_source = "" +tags = ["scope"] # Featured image # To use, add an image named `featured.jpg/png` to your page's folder. @@ -66,3 +14,12 @@ url_source = "" # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight focal_point = "" +++ + + +**Carl Pearson** + +*M.S. Thesis* + +With the end of Dennard scaling, high-performance computing increasingly relies on heterogeneous systems with specialized hardware to improve application performance. This trend has driven up the complexity of high-performance software development, as developers must manage multiple programming systems and develop system-tuned code to utilize specialized hardware. In addition, it has exacerbated existing challenges of data placement as the specialized hardware often has local memories to fuel its computational demands. 
In addition to using appropriate software resources to target application computation at the best hardware for the job, application developers now must manage data movement and placement within their application, which also must be specifically tuned to the target system. Instead of relying on the application developer to have specialized knowledge of system characteristics and specialized expertise in multiple programming systems, this work proposes a heterogeneous system communication library that automatically chooses data location and data movement for high-performance application development and execution on heterogeneous systems. This work presents the foundational components of that library: a systematic approach for characterization of system communication links and application communication demands. + +* [pdf](/pdf/20180625_pearson_ms.pdf) \ No newline at end of file diff --git a/content/publication/20180628_pearson_iwoph.md b/content/publication/20180628_pearson_iwoph.md index 078f273..7334843 100644 --- a/content/publication/20180628_pearson_iwoph.md +++ b/content/publication/20180628_pearson_iwoph.md @@ -21,9 +21,7 @@ publication_types = ["1"] publication = "International Workshop on OpenPower in HPC" publication_short = "IWOPH 2018" -# Abstract and optional shortened version. -abstract = "High-performance computing increasingly relies on heterogeneous systems with specialized hardware accelerators to improve application performance. For example, NVIDIA’s CUDA programming system and general-purpose GPUs have emerged as a widespread accelerator in HPC systems. This trend has exacerbated challenges of data placement as accelerators often have fast local memories to fuel their computational demands, but slower interconnects to feed those memories. Crucially, real-world data-transfer performance is strongly influenced not just by the underlying hardware, but by the capabilities of the programming systems. Understanding how application performance is affected by the logical communication exposed through abstractions, as well as the underlying system topology, is crucial for developing high-performance applications and architectures.
This report presents initial data-transfer microbenchmark results from two POWER-based systems obtained during work towards developing an automated system performance characterization tool. \ No newline at end of file diff --git a/content/publication/20180919_pearson_arxiv.md b/content/publication/20180919_pearson_arxiv.md index 51fc834..2c11093 100644 --- a/content/publication/20180919_pearson_arxiv.md +++ b/content/publication/20180919_pearson_arxiv.md @@ -66,3 +66,5 @@ url_source = "" # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight focal_point = "" +++ + +This report presents the design of the Scope infrastructure for extensible and portable benchmarking. Improvements in high-performance computing systems rely on coordination across different levels of system abstraction. Developing and defining accurate performance measurements is necessary at all levels of the system hierarchy, and should be as accessible as possible to developers with different backgrounds. The Scope project aims to lower the barrier to entry for developing performance benchmarks by providing a software architecture that allows benchmarks to be developed independently, by providing useful C/C++ abstractions and utilities, and by providing a Python package for generating publication-quality plots of resulting measurements. \ No newline at end of file diff --git a/content/publication/20180925_mailthody_hpec.md b/content/publication/20180925_mailthody_hpec.md index 5b4f76f..9b62704 100644 --- a/content/publication/20180925_mailthody_hpec.md +++ b/content/publication/20180925_mailthody_hpec.md @@ -21,10 +21,6 @@ publication_types = ["1"] publication = "In *2018 IEEE High Performance extreme Computing Conference*" publication_short = "In *HPEC*" -# Abstract and optional shortened version. -abstract = 'In this paper, we present an update to our previous submission from Graph Challenge 2017. This work describes and evaluates new software algorithm optimizations undertaken for our 2018 year submission on Collaborative CPU+GPU Algorithms for Triangle Counting and Truss Decomposition. First, we describe four major optimizations for the triangle counting which improved performance by up to 117x over our prior submission. Additionally, we show that our triangle-counting algorithm is on average 151.7x faster than NVIDIA’s NVGraph library (max 476x) for SNAP datasets. Second, we propose a novel parallel k-truss decomposition algorithm that is time-efficient and is up to 13.9x faster than our previous submission. Third, we evaluate the effect of generational hardware improvements between the IBM “Minsky” (POWER8, P100, NVLink 1.0) and “Newell” (POWER9, V100, NVLink 2.0) platforms. Lastly, the software optimizations presented in this work and the hardware improvements in the Newell platform enable analytics and discovery on large graphs with millions of nodes and billions of edges in less than a minute. In sum, the new algorithmic implementations are significantly faster and can handle much larger “big” graphs.' -abstract_short = "" - # Does this page contain LaTeX math? (true/false) math = false @@ -66,3 +62,5 @@ url_source = "" # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight focal_point = "" +++ + +In this paper, we present an update to our previous submission from Graph Challenge 2017. 
This work describes and evaluates new software algorithm optimizations undertaken for our 2018 year submission on Collaborative CPU+GPU Algorithms for Triangle Counting and Truss Decomposition. First, we describe four major optimizations for the triangle counting which improved performance by up to 117x over our prior submission. Additionally, we show that our triangle-counting algorithm is on average 151.7x faster than NVIDIA’s NVGraph library (max 476x) for SNAP datasets. Second, we propose a novel parallel k-truss decomposition algorithm that is time-efficient and is up to 13.9x faster than our previous submission. Third, we evaluate the effect of generational hardware improvements between the IBM “Minsky” (POWER8, P100, NVLink 1.0) and “Newell” (POWER9, V100, NVLink 2.0) platforms. Lastly, the software optimizations presented in this work and the hardware improvements in the Newell platform enable analytics and discovery on large graphs with millions of nodes and billions of edges in less than a minute. In sum, the new algorithmic implementations are significantly faster and can handle much larger “big” graphs. \ No newline at end of file diff --git a/content/publication/20190410_pearson_icpe/index.md b/content/publication/20190410_pearson_icpe/index.md index 31b2cd3..3dfde25 100644 --- a/content/publication/20190410_pearson_icpe/index.md +++ b/content/publication/20190410_pearson_icpe/index.md @@ -1,80 +1,24 @@ +++ -title = "Evaluating Characteristics of CUDA Communication Primitives on High-Bandwidth Interconnects" -date = 2019-04-09T00:00:00 # Schedule page publish date. +title = "[ICPE'19] Evaluating Characteristics of CUDA Communication Primitives on High-Bandwidth Interconnects" +date = 2019-04-09 # Schedule page publish date. draft = false -# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`. -authors = ["Carl Pearson", "Adbul Dakkak", "Sarah Hashash", "Cheng Li", "I-Hsin Chung", "Jinjun Xiong", "Wen-Mei Hwu"] +tags = ["scope"] -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["1"] ++++ -# Publication name and optional abbreviated version. -publication = "In *2019 ACM/SPEC International Conference on Performance Engineering*" -publication_short = "In *ICPE*" +**Carl Pearson, Abdul Dakkak, Sarah Hashash, Cheng Li, I-Hsin Chung, Jinjun Xiong, Wen-mei Hwu** -# Abstract and optional shortened version. -abstract = """Data-intensive applications such as machine learning and analytics have created a demand for faster interconnects to avert the memory bandwidth wall and allow GPUs to be effectively leveraged for lower compute intensity tasks. This has resulted in wide adoption of heterogeneous systems with varying underlying interconnects, and has delegated the task of understanding and copying data to the system or application developer. No longer is a malloc followed by memcpy the only or dominating modality of data transfer; application developers are faced with additional options such as unified memory and zero-copy memory. Data transfer performance on these systems is now impacted by many factors including data transfer modality, system interconnect hardware details, CPU caching state, CPU power management state, driver policies, virtual memory paging efficiency, and data placement.
+ +In *2019 ACM/SPEC International Conference on Performance Engineering* + +Data-intensive applications such as machine learning and analytics have created a demand for faster interconnects to avert the memory bandwidth wall and allow GPUs to be effectively leveraged for lower compute intensity tasks. This has resulted in wide adoption of heterogeneous systems with varying underlying interconnects, and has delegated the task of understanding and copying data to the system or application developer. No longer is a malloc followed by memcpy the only or dominating modality of data transfer; application developers are faced with additional options such as unified memory and zero-copy memory. Data transfer performance on these systems is now impacted by many factors including data transfer modality, system interconnect hardware details, CPU caching state, CPU power management state, driver policies, virtual memory paging efficiency, and data placement. This paper presents Comm|Scope, a set of microbenchmarks designed for system and application developers to understand memory transfer behavior across different data placement and exchange scenarios. Comm|Scope comprehensively measures the latency and bandwidth of CUDA data transfer primitives, and avoids common pitfalls in ad-hoc measurements by controlling CPU caches and clock frequencies, and by avoiding measurement of synchronization costs imposed by the measurement methodology where possible. This paper also presents an evaluation of Comm|Scope on systems featuring the POWER and x86 CPU architectures and PCIe 3, NVLink 1, and NVLink 2 interconnects. These systems are chosen as representative configurations of current high-performance GPU platforms. Comm|Scope measurements can serve to update insights about the relative performance of data transfer methods on current systems.
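For readers unfamiliar with the transfer modalities the abstract contrasts, the following CUDA C++ sketch shows all three side by side. This is illustrative only and is not Comm|Scope itself (Comm|Scope builds controlled benchmarks around these primitives); the kernel, sizes, and omitted error checking are simplifications.

```cpp
// Sketch: the three CUDA data-transfer modalities named in the abstract.
// Error checking elided; kernel and sizes are illustrative only.
#include <cuda_runtime.h>
#include <cstdlib>

__global__ void touch(float *x, size_t n) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < n) x[i] += 1.0f;
}

int main() {
  const size_t n = 1 << 20;
  const size_t bytes = n * sizeof(float);
  const unsigned blocks = (unsigned)((n + 255) / 256);

  cudaSetDeviceFlags(cudaDeviceMapHost); // required on some platforms for zero-copy

  // 1) Explicit copy: pageable host allocation staged with cudaMemcpy.
  float *h = (float *)std::malloc(bytes);
  float *d = nullptr;
  cudaMalloc(&d, bytes);
  cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
  touch<<<blocks, 256>>>(d, n);

  // 2) Unified memory: one pointer; the driver migrates pages on demand.
  float *u = nullptr;
  cudaMallocManaged(&u, bytes);
  touch<<<blocks, 256>>>(u, n);

  // 3) Zero-copy: pinned, mapped host memory; the kernel reads and writes
  //    across the interconnect instead of migrating the data.
  float *zh = nullptr, *zd = nullptr;
  cudaHostAlloc(&zh, bytes, cudaHostAllocMapped);
  cudaHostGetDevicePointer(&zd, zh, 0);
  touch<<<blocks, 256>>>(zd, n);

  cudaDeviceSynchronize();
  cudaFree(d);
  cudaFree(u);
  cudaFreeHost(zh);
  std::free(h);
  return 0;
}
```

Which modality wins depends on exactly the factors listed above (interconnect, caching state, driver policy, placement), which is why measured results such as Comm|Scope's are preferable to rules of thumb.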
-This work also reports insights for how high-level system design choices affect the performance of these data transfers, and how developers can optimize applications on these systems."""
-abstract_short = "This paper presents Comm|Scope, a set of microbenchmarks designed for system and application developers to understand memory transfer behavior across different data placement and exchange scenarios."
-
-
-# Does this page contain LaTeX math? (true/false)
-math = false
-
-# Does this page require source code highlighting? (true/false)
-highlight = false
-
-# Featured image thumbnail (optional)
-image_preview = ""
-
-# Is this a selected publication? (true/false)
-selected = true
-
-# Projects (optional).
-#   Associate this publication with one or more of your projects.
-#   Simply enter your project's folder or file name without extension.
-#   E.g. `projects = ["deep-learning"]` references 
-#   `content/project/deep-learning/index.md`.
-#   Otherwise, set `projects = []`.
-projects = ["scope"]
-
-# Links (optional)
-url_pdf = "pdf/20190410_pearson_icpe.pdf"
-url_preprint = ""
-url_code = "https://github.com/c3sr/comm_scope"
-url_dataset = ""
-url_project = ""
-url_slides = "/pdf/20190410_pearson_icpe_slides.pdf"
-url_video = ""
-url_poster = ""
-url_source = ""
-
-# Featured image
-# To use, add an image named `featured.jpg/png` to your page's folder.
-[image]
-  # Caption (optional)
-  caption = ""
-
-  # Focal point (optional)
-  # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight
-  focal_point = ""
-+++
-
+This work also reports insights into how high-level system design choices affect the performance of these data transfers, and how developers can optimize applications on these systems.
 
 * Best Paper for the ICPE research track papers
 * Functional ACM Artifact Evaluation.
 
@@ -98,4 +42,8 @@ keywords = {numa, cuda, power, nvlink, x86, gpu, benchmarking},
 location = {Mumbai, India},
 series = {ICPE ’19}
 }
-```
\ No newline at end of file
+```
+
+* [pdf](/pdf/20190410_pearson_icpe.pdf)
+* [code](https://github.com/c3sr/comm_scope)
+* [project](/project/scope)
diff --git a/content/publication/20190926_almasri_hpec/index.md b/content/publication/20190926_almasri_hpec/index.md
index 8db39cc..c35344c 100644
--- a/content/publication/20190926_almasri_hpec/index.md
+++ b/content/publication/20190926_almasri_hpec/index.md
@@ -21,16 +21,6 @@ publication_types = ["1"]
 publication = "2019 IEEE High Performance Extreme Computing Conference"
 publication_short = "In *HPEC'19*"
 
-# Abstract and optional shortened version.
-abstract = """
-In this paper, we present an update to our previous submission on k-truss decomposition from Graph Challenge 2018.
-For single GPU k-truss implementation, we propose multiple algorithmic optimizations that significantly improve performance by up to 35.2x (6.9x on average) compared to our previous GPU implementation. In addition, we present a scalable multi-GPU implementation in which each GPU handles a different 'k' value.
-Compared to our prior multi-GPU implementation,the proposed approach is faster by up to 151.3x (78.8x on average). In case when the edges with only maximal k-truss are sought, incrementing the 'k' value in each iteration is inefficient particularly for graphs with large maximum k-truss.
-Thus, we propose binary search for the 'k' value to find the maximal k-truss. The binary search approach on a single GPU is up to 101.5 (24.3x on average) faster than our 2018 $k$-truss submission.
-Lastly, we show that the proposed binary search finds the maximum k-truss for "Twitter" graph dataset having 2.8 billion bidirectional edges in just 16 minutes on a single V100 GPU.
-"""
-abstract_short = ""
-
 # Does this page contain LaTeX math? (true/false)
 math = false
 
@@ -73,3 +63,9 @@ url_source = ""
   # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight
   focal_point = ""
 +++
+
+In this paper, we present an update to our previous submission on k-truss decomposition from Graph Challenge 2018.
+For the single-GPU k-truss implementation, we propose multiple algorithmic optimizations that significantly improve performance, by up to 35.2x (6.9x on average) compared to our previous GPU implementation. In addition, we present a scalable multi-GPU implementation in which each GPU handles a different 'k' value.
+Compared to our prior multi-GPU implementation, the proposed approach is faster by up to 151.3x (78.8x on average). When only the edges of the maximal k-truss are sought, incrementing the 'k' value in each iteration is inefficient, particularly for graphs with a large maximum k-truss.
+Thus, we propose a binary search over the 'k' value to find the maximal k-truss. The binary search approach on a single GPU is up to 101.5x (24.3x on average) faster than our 2018 $k$-truss submission.
+Lastly, we show that the proposed binary search finds the maximum k-truss of the "Twitter" graph dataset, which has 2.8 billion bidirectional edges, in just 16 minutes on a single V100 GPU.
\ No newline at end of file
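The binary search above relies on trusses being nested: if a k-truss exists, so does every smaller truss. A sketch of that search, with a hypothetical `exists` predicate standing in for a GPU truss computation (the names here are invented for illustration):

```cpp
// Binary search for the largest feasible k given a monotone predicate:
// exists(k) is true for all k up to the answer and false afterwards.
#include <cstdint>
#include <cstdio>
#include <functional>

uint32_t max_feasible_k(uint32_t kmax,
                        const std::function<bool(uint32_t)> &exists) {
  uint32_t lo = 2, hi = kmax; // the 2-truss is all edges by definition
  while (lo < hi) {
    uint32_t mid = lo + (hi - lo + 1) / 2; // round up so the loop terminates
    if (exists(mid)) lo = mid;             // a mid-truss exists: answer >= mid
    else             hi = mid - 1;         // no mid-truss: answer < mid
  }
  return lo;
}

int main() {
  // Stand-in predicate: pretend the maximal k-truss is k = 37.
  auto fake = [](uint32_t k) { return k <= 37; };
  printf("max k-truss: %u\n", max_feasible_k(1u << 20, fake));
}
```

This finds the maximum k in O(log kmax) truss computations instead of the kmax computations an incrementing scan would need, which is the source of the speedup the abstract reports.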
diff --git a/content/publication/20190926_huang_hpec/index.md b/content/publication/20190926_huang_hpec/index.md
index faac57a..1d5cba4 100644
--- a/content/publication/20190926_huang_hpec/index.md
+++ b/content/publication/20190926_huang_hpec/index.md
@@ -21,12 +21,6 @@ publication_types = ["1"]
 publication = "2019 IEEE High Performance Extreme Computing Conference"
 publication_short = "In *HPEC'19*"
 
-# Abstract and optional shortened version.
-abstract = """
-Deep neural networks (DNNs) have been widely adopted in many domains, including computer vision, natural language processing, and medical care. Recent research revealsthat sparsity in DNN parameters can be exploited to reduce inference computational complexity and improve network quality. However, sparsity also introduces irregularity and extra complexity in data processing, which make the accelerator design challenging. This work presents the design and implementation of a highly flexible sparse DNN inference accelerator on FPGA.Our proposed inference engine can be easily configured to beused in both mobile computing and high-performance computing scenarios. Evaluation shows our proposed inference engine effectively accelerates sparse DNNs and outperforms CPU solution by up to 4.7x in terms of energy efficiency.
-"""
-abstract_short = ""
-
 # Does this page contain LaTeX math? (true/false)
 math = false
 
@@ -69,3 +63,4 @@ url_source = ""
   # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight
   focal_point = ""
 +++
+Deep neural networks (DNNs) have been widely adopted in many domains, including computer vision, natural language processing, and medical care. Recent research reveals that sparsity in DNN parameters can be exploited to reduce inference computational complexity and improve network quality. However, sparsity also introduces irregularity and extra complexity in data processing, which makes accelerator design challenging. This work presents the design and implementation of a highly flexible sparse DNN inference accelerator on FPGA. Our proposed inference engine can be easily configured for use in both mobile computing and high-performance computing scenarios. Evaluation shows our proposed inference engine effectively accelerates sparse DNNs and outperforms a CPU solution by up to 4.7x in terms of energy efficiency.
\ No newline at end of file
diff --git a/content/publication/20190926_pearson_hpec/index.md b/content/publication/20190926_pearson_hpec/index.md
index 7bf1788..94deea0 100644
--- a/content/publication/20190926_pearson_hpec/index.md
+++ b/content/publication/20190926_pearson_hpec/index.md
@@ -1,71 +1,19 @@
 +++
-title = "Update on Triangle Counting on GPU"
+title = "[HPEC'19] Update on Triangle Counting on GPU"
 date = 2019-08-22T00:00:00 # Schedule page publish date.
 draft = false
 
-# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`.
-authors = ["Carl Pearson", "Mohammad Almasri", "Omer Anjum", "Vikram S. Mailthody", "Zaid Qureshi", "Rakesh Nagi", "Jinjun Xiong", "Wen-Mei Hwu"]
-
-# Publication type.
-# Legend:
-# 0 = Uncategorized
-# 1 = Conference paper
-# 2 = Journal article
-# 3 = Manuscript
-# 4 = Report
-# 5 = Book
-# 6 = Book section
-publication_types = ["1"]
-
-# Publication name and optional abbreviated version.
-publication = "2019 IEEE High Performance Extreme Computing Conference" -publication_short = "In *HPEC'19*" - -# Abstract and optional shortened version. -abstract = """ -This work presents an update to the triangle-counting portion of the subgraph isomorphism static graph challenge. This work is motivated by a desire to understand the impact of CUDA unified memory on the triangle-counting problem. First, CUDA unified memory is used to overlap reading large graph data from disk with graph data structures in GPU memory. Second, we use CUDA unified memory hintsto solve multi-GPU performance scaling challenges present in our last submission. Finally, we improve the single-GPU kernel performance from our past submission by introducing a work-stealing dynamic algorithm GPU kernel with persistent threads, which makes performance adaptive for large graphs withoutrequiring a graph analysis phase. -""" -abstract_short = "" -# Does this page contain LaTeX math? (true/false) -math = false -# Does this page require source code highlighting? (true/false) -highlight = false - -# Featured image thumbnail (optional) -image_preview = "" - -# Is this a selected publication? (true/false) -selected = true - -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = [] - -# Links (optional) -url_pdf = "pdf/2019_pearson_hpec.pdf" -url_preprint = "" -url_code = "" -url_dataset = "" -url_project = "" -url_slides = "" -url_video = "" -url_poster = "pdf/2019_pearson_hpec_poster.pdf" -url_source = "" - -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder. -[image] - # Caption (optional) - caption = "" - - # Focal point (optional) - # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight - focal_point = "" +tags = ["applications"] +++ + +**Carl Pearson, Mohammad Almasri, Omer Anjum, Vikram S. Mailthody, Zaid Qureshi, Rakesh Nagi, Jinjun Xiong, Wen-Mei Hwu** + +In *2019 IEEE High Performance Extreme Computing Conference*. + +This work presents an update to the triangle-counting portion of the subgraph isomorphism static graph challenge. This work is motivated by a desire to understand the impact of CUDA unified memory on the triangle-counting problem. First, CUDA unified memory is used to overlap reading large graph data from disk with graph data structures in GPU memory. Second, we use CUDA unified memory hintsto solve multi-GPU performance scaling challenges present in our last submission. Finally, we improve the single-GPU kernel performance from our past submission by introducing a work-stealing dynamic algorithm GPU kernel with persistent threads, which makes performance adaptive for large graphs withoutrequiring a graph analysis phase. 
+
+* [pdf](/pdf/2019_pearson_hpec.pdf)
+* [poster](/pdf/2019_pearson_hpec_poster.pdf)
\ No newline at end of file
diff --git a/content/publication/20200522_pearson_iwapt/index.md b/content/publication/20200522_pearson_iwapt/index.md
index 4f06d9e..d52196f 100644
--- a/content/publication/20200522_pearson_iwapt/index.md
+++ b/content/publication/20200522_pearson_iwapt/index.md
@@ -21,17 +21,6 @@ publication_types = ["1"]
 publication = "2020 IEEE International Workshop on Automatic Performance Tuning"
 publication_short = "In *iWAPT'20*"
 
-# Abstract and optional shortened version.
-abstract = """
-High-performance distributed computing systems increasingly feature nodes that have multiple CPU sockets and multiple GPUs.
-The communication bandwidth between these components is non-uniform.
-Furthermore, these systems can expose different communication capabilities between these components.
-For communication-heavy applications, optimally using these capabilities is challenging and essential for performance.
-Bespoke codes with optimized communication may be non-portable across run-time/software/hardware configurations, and existing stencil frameworks neglect optimized communication.
-This work presents node-aware approaches for automatic data placement and communication implementation for 3D stencil codes on multi-GPU nodes with non-homogeneous communication performance and capabilities.
-Benchmarking results in the Summit system show that choices in placement can result in a 20% improvement in single-node exchange, and communication specialization can yield a further 6x improvement in exchange time in a single node, and a 16% improvement at 1536 GPUs."""
-abstract_short = ""
-
 # Does this page contain LaTeX math? (true/false)
 math = false
 
@@ -74,3 +63,11 @@ url_source = ""
   # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight
   focal_point = ""
 +++
+
+High-performance distributed computing systems increasingly feature nodes that have multiple CPU sockets and multiple GPUs.
+The communication bandwidth between these components is non-uniform.
+Furthermore, these systems can expose different communication capabilities between these components.
+For communication-heavy applications, optimally using these capabilities is challenging and essential for performance.
+Bespoke codes with optimized communication may be non-portable across run-time/software/hardware configurations, and existing stencil frameworks neglect optimized communication.
+This work presents node-aware approaches for automatic data placement and communication implementation for 3D stencil codes on multi-GPU nodes with non-homogeneous communication performance and capabilities.
+Benchmarking results on the Summit system show that placement choices can yield a 20% improvement in single-node exchange time, and that communication specialization can yield a further 6x improvement in single-node exchange time and a 16% improvement at 1536 GPUs.
\ No newline at end of file
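One example of the communication specialization this abstract refers to: using a direct peer copy when two GPUs can access each other (for example, over NVLink), and staging through pinned host memory otherwise. The following is a sketch under those assumptions; the function and buffer names are invented, and production codes would use asynchronous copies in streams.

```cpp
// Sketch: pick a halo-exchange implementation based on peer-access capability.
#include <cuda_runtime.h>

void exchange_halo(void *dst, int dstDev, const void *src, int srcDev,
                   size_t bytes, void *hostStage /* pinned host buffer */) {
  int canPeer = 0;
  cudaDeviceCanAccessPeer(&canPeer, dstDev, srcDev);
  if (canPeer) {
    // Direct GPU-to-GPU copy; enabling peer access is idempotent and
    // returns an error code (ignored here) if already enabled.
    cudaSetDevice(dstDev);
    cudaDeviceEnablePeerAccess(srcDev, 0);
    cudaMemcpyPeer(dst, dstDev, src, srcDev, bytes);
  } else {
    // Fall back to staging the face through pinned host memory.
    cudaSetDevice(srcDev);
    cudaMemcpy(hostStage, src, bytes, cudaMemcpyDeviceToHost);
    cudaSetDevice(dstDev);
    cudaMemcpy(dst, hostStage, bytes, cudaMemcpyHostToDevice);
  }
}
```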
-abstract = """ -This paper presents GPU performance optimization and scaling results for inference models of the Sparse Deep Neural Network Challenge 2020. -Demands for network quality have increased rapidly, pushing the size and thus the memory requirements of many neural networks beyond the capacity ofavailable accelerators. -Sparse deep neural networks (SpDNN) have shown promise for reining in the memory footprint of large neural networks.\ -However, there is room for improvement inimplementing SpDNN operations on GPUs. -This work presents optimized sparse matrix multiplication kernels fused with theReLU function. -The optimized kernels reuse input feature mapsfrom the shared memory and sparse weights from registers. -For multi-GPU parallelism, our SpDNN implementation duplicates weights and statically partition the feature maps across GPUs. -Results for the challenge benchmarks show that the proposed kernel design and multi-GPU parallelization achieve up to 180TeraEdges per second inference throughput. -These results areup to 4.3x faster for a single GPU and an order of magnitude faster at full scale than those of the champion of the 2019 SparseDeep Neural Network Graph Challenge for the same generation of NVIDIA V100 GPUs. -Using the same implementation1, we also show single-GPU throughput on NVIDIA A100 is 2.37x fasterthan V100""" -abstract_short = "" - # Does this page contain LaTeX math? (true/false) math = false @@ -77,3 +63,14 @@ url_source = "" # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight focal_point = "" +++ + +This paper presents GPU performance optimization and scaling results for inference models of the Sparse Deep Neural Network Challenge 2020. +Demands for network quality have increased rapidly, pushing the size and thus the memory requirements of many neural networks beyond the capacity ofavailable accelerators. +Sparse deep neural networks (SpDNN) have shown promise for reining in the memory footprint of large neural networks.\ +However, there is room for improvement inimplementing SpDNN operations on GPUs. +This work presents optimized sparse matrix multiplication kernels fused with theReLU function. +The optimized kernels reuse input feature mapsfrom the shared memory and sparse weights from registers. +For multi-GPU parallelism, our SpDNN implementation duplicates weights and statically partition the feature maps across GPUs. +Results for the challenge benchmarks show that the proposed kernel design and multi-GPU parallelization achieve up to 180TeraEdges per second inference throughput. +These results areup to 4.3x faster for a single GPU and an order of magnitude faster at full scale than those of the champion of the 2019 SparseDeep Neural Network Graph Challenge for the same generation of NVIDIA V100 GPUs. +Using the same implementation1, we also show single-GPU throughput on NVIDIA A100 is 2.37x fasterthan V100 \ No newline at end of file diff --git a/content/publication/20201229_pearson_arxiv/index.md b/content/publication/20201229_pearson_arxiv/index.md index c8bdaaa..54786c4 100644 --- a/content/publication/20201229_pearson_arxiv/index.md +++ b/content/publication/20201229_pearson_arxiv/index.md @@ -1,70 +1,20 @@ +++ -title = "Fast CUDA-Aware MPI Datatypes without Platform Support (preprint)" +title = "[preprint] Fast CUDA-Aware MPI Datatypes without Platform Support (preprint)" date = 2020-01-03T00:00:00 # Schedule page publish date. draft = false -# Authors. Comma separated list, e.g. 
`["Bob Smith", "David Jones"]`. -authors = ["Carl Pearson", "Kun Wu", "I-Hsin Chung", "Jinjun Xiong", "Wen-Mei Hwu"] - -# Publication type. -# Legend: -# 0 = Uncategorized -# 1 = Conference paper -# 2 = Journal article -# 3 = Manuscript -# 4 = Report -# 5 = Book -# 6 = Book section -publication_types = ["2"] - -# Publication name and optional abbreviated version. -publication = "Arxiv Preprint" -publication_short = "arxiv preprint" - -# Abstract and optional shortened version. -abstract = """ -MPI Derived Datatypes are an abstraction that simplifies handling of non-contiguous data in MPI applications. These datatypes are recursively constructed at runtime from primitive Named Types defined in the MPI standard. More recently, the development and deployment of CUDA-aware MPI implementations has encouraged the transition of distributed high-performance MPI codes to use GPUs. These implementations allow MPI functions to directly operate on GPU buffers, easing integration of GPU compute into MPI codes. Despite substantial attention to CUDA-aware MPI implementations, they continue to offer cripplingly poor GPU performance when manipulating derived datatypes on GPUs. This work presents an approach to integrating fast derived datatype handling into existing MPI deployments through an interposed library. This library can be used regardless of MPI deployment and without modifying application code. Furthermore, this work presents a performance model of GPU derived datatype handling, demonstrating that "one-shot" methods are not always fastest. Ultimately, the interposed-library model of this work demonstrates MPI_Pack speedup of up to 724,000 and MPI_Send speedup of up to 59,000x compared to the MPI implementation deployed on a leadership-class supercomputer. This yields speedup of more than 20,000x in a 3D halo exchange. """ -abstract_short = "" - # Does this page contain LaTeX math? (true/false) math = false -# Does this page require source code highlighting? (true/false) -highlight = false - -# Featured image thumbnail (optional) -image_preview = "" - -# Is this a selected publication? (true/false) -selected = true - -# Projects (optional). -# Associate this publication with one or more of your projects. -# Simply enter your project's folder or file name without extension. -# E.g. `projects = ["deep-learning"]` references -# `content/project/deep-learning/index.md`. -# Otherwise, set `projects = []`. -projects = [""] - -# Links (optional) -url_pdf = "" -url_preprint = "pdf/20201229_pearson_arxiv.pdf" -url_code = "https://github.com/cwpearson/tempi" -url_dataset = "" -url_project = "" -url_slides = "" -url_video = "" -url_poster = "" -url_source = "" - -# Featured image -# To use, add an image named `featured.jpg/png` to your page's folder. -[image] - # Caption (optional) - caption = "" - - # Focal point (optional) - # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight - focal_point = "" +tags = ["stencil, mpi"] +++ + +**Carl Pearson, Kun Wu, I-Hsin Chung, Jinjun Xiong, Wen-Mei Hwu** + +*arxiv preprint* + +MPI Derived Datatypes are an abstraction that simplifies handling of non-contiguous data in MPI applications. These datatypes are recursively constructed at runtime from primitive Named Types defined in the MPI standard. More recently, the development and deployment of CUDA-aware MPI implementations has encouraged the transition of distributed high-performance MPI codes to use GPUs. 
+MPI Derived Datatypes are an abstraction that simplifies handling of non-contiguous data in MPI applications. These datatypes are recursively constructed at runtime from primitive Named Types defined in the MPI standard. More recently, the development and deployment of CUDA-aware MPI implementations has encouraged the transition of distributed high-performance MPI codes to use GPUs. These implementations allow MPI functions to directly operate on GPU buffers, easing integration of GPU compute into MPI codes. Despite substantial attention to CUDA-aware MPI implementations, they continue to offer cripplingly poor GPU performance when manipulating derived datatypes on GPUs. This work presents an approach to integrating fast derived datatype handling into existing MPI deployments through an interposed library. This library can be used regardless of MPI deployment and without modifying application code. Furthermore, this work presents a performance model of GPU derived datatype handling, demonstrating that "one-shot" methods are not always fastest. Ultimately, the interposed-library model of this work demonstrates MPI_Pack speedup of up to 724,000x and MPI_Send speedup of up to 59,000x compared to the MPI implementation deployed on a leadership-class supercomputer. This yields speedup of more than 20,000x in a 3D halo exchange.
+
+* [pdf](/pdf/20201229_pearson_arxiv.pdf)
+* [code](https://github.com/cwpearson/tempi)
\ No newline at end of file
diff --git a/content/talk/20170910_hwu_bsc.md b/content/talk/20170910_hwu_bsc.md
new file mode 100644
index 0000000..fee5d7e
--- /dev/null
+++ b/content/talk/20170910_hwu_bsc.md
@@ -0,0 +1,12 @@
++++
+title = "Innovative Applications and Technology Pivots - A Perfect Storm in Computing"
+date = 2017-09-10
+draft = false
+
++++
+
+**Wen-mei Hwu, Izzat El Hajj, Simon Garcia de Gonzalo, Carl Pearson, Nam Sung Kim, Deming Chen, Jinjun Xiong, Zehra Sura**
+
+At *Barcelona Supercomputing Center*.
+
+* [pdf](/pdf/20170910_hwu_bsc.pdf)
\ No newline at end of file
diff --git a/themes/anatole b/themes/anatole
index 6f80498..36d0d19 160000
--- a/themes/anatole
+++ b/themes/anatole
@@ -1 +1 @@
-Subproject commit 6f804985e4e301afc26e9d4cf26b8f3030d8887a
+Subproject commit 36d0d1963bcebd750b5271910bc66712110124b5
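As an illustration of the datatype handling the preprint above targets, the following sketch describes one strided face of a 3D grid with `MPI_Type_vector` and sends it directly from GPU memory through a CUDA-aware MPI. An interposed library like the one described would intercept this `MPI_Send` and repack the non-contiguous GPU data with its own kernels. Extents and names are illustrative; run with two ranks (e.g., `mpirun -np 2`).

```cpp
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  const int nx = 512, ny = 512; // one XY plane of a 3D grid
  double *grid;
  cudaMalloc((void **)&grid, sizeof(double) * nx * ny);

  // A face of the plane: ny blocks of 1 double, nx doubles apart.
  MPI_Datatype face;
  MPI_Type_vector(ny, 1, nx, MPI_DOUBLE, &face);
  MPI_Type_commit(&face);

  // A CUDA-aware MPI accepts the GPU pointer directly; the datatype
  // describes the strided (non-contiguous) layout being exchanged.
  if (rank == 0)
    MPI_Send(grid, 1, face, 1, 0, MPI_COMM_WORLD);
  else if (rank == 1)
    MPI_Recv(grid, 1, face, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  MPI_Type_free(&face);
  cudaFree(grid);
  MPI_Finalize();
  return 0;
}
```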