This commit is contained in:
Carl Pearson
2017-07-18 17:22:39 -04:00
parent 74a913420d
commit 0096219bd2
5 changed files with 5 additions and 35 deletions

View File

@@ -14,7 +14,7 @@ weight = 60
+++
I am a University of Illinois [Mavis Future Faculty Fellow](https://publish.illinois.edu/engr-mavis/) for the 2017-2018 academic year.
I am a University of Illinois [Mavis Future Faculty Fellow](http://publish.illinois.edu/engr-mavis/2017-2018-mavis-fellows/) for the 2017-2018 academic year.
I have been a teaching assistant for the following courses:

View File

@@ -5,14 +5,7 @@ date = "2014-01-01"
title = "Adaptive Cache Bypass and Insertion for Many-Core Accelerators"
authors = ["Xuhao Chen", "Shengzhao Wu", "Li-Wen Chang", "Wei-Sheng Huang", "Carl Pearson", "Wen-mei Hwu"]
abstract = 'Many-core accelerators, e.g. GPUs, are widely used for accelerating general-purpose compute kernels.
With the SIMT execution model, GPUs can hide memory latency through massive multithreading for many regular applications.
To support more applications with irregular memory access pattern, cache hierarchy is introduced to GPU architecture to capture input data sharing and mitigate the effect of irregular accesses.
However, GPU caches suffer from poor efficiency due to severe contention, which makes it difficult to adopt heuristic management policies, and also limits system performance and energy-efficiency.
We propose an adaptive cache management policy specifically for many-core accelerators.
The tag array of L2 cache is enhanced with extra bits to track memory access history, and thus the locality information is captured and provided to L1 cache as heuristics to guide its run-time bypass and insertion decisions.
By preventing un-reused data from polluting the cache and alleviating contention, cache efficiency is significantly improved.
As a result, the system performance is improved by 31% on average for cache sensitive benchmarks, compared to the baseline GPU architecture.'
abstract = 'Many-core accelerators, e.g. GPUs, are widely used for accelerating general-purpose compute kernels. With the SIMT execution model, GPUs can hide memory latency through massive multithreading for many regular applications. To support more applications with irregular memory access pattern, cache hierarchy is introduced to GPU architecture to capture input data sharing and mitigate the effect of irregular accesses. However, GPU caches suffer from poor efficiency due to severe contention, which makes it difficult to adopt heuristic management policies, and also limits system performance and energy-efficiency. We propose an adaptive cache management policy specifically for many-core accelerators. The tag array of L2 cache is enhanced with extra bits to track memory access history, and thus the locality information is captured and provided to L1 cache as heuristics to guide its run-time bypass and insertion decisions. By preventing un-reused data from polluting the cache and alleviating contention, cache efficiency is significantly improved. As a result, the system performance is improved by 31% on average for cache sensitive benchmarks, compared to the baseline GPU architecture.'
image = ""
image_preview = ""

View File

@@ -5,13 +5,7 @@ date = "2016-01-01"
title = "WebGPU: A Scalable Online Development Platform for GPU Programming Courses"
authors = ["Abdul Dakkak", "Carl Pearson", "Cheng Li"]
abstract = 'The popularity of computer science classes offered through Massive Open On-line Courses (MOOCs) creates both opportunities and challenges.
Programming-based classes need to provide consistent development infrastructures that are both scalable and user friendly to students.
The \"Heterogeneous Parallel Programming\" class offered through Coursera teaches GPU programming and encountered these problems.
We developed WebGPU - an online GPU development platform - providing students with a user friendly scalable GPU computing platform throughout the course.
It has been used as the CUDA, OpenACC, and OpenCL programming environment for large Coursera courses, short-running summer schools, and traditional semester-long graduate and undergraduate courses.
WebGPU has since replaced our traditional development infrastructure for the GPU classes offered at UIUC.
This paper presents the original, revised, and upcoming WebGPU designs that address the requirements and challenges of offering sophisticated computing resources to a large, quickly-varying number of students.'
abstract = 'The popularity of computer science classes offered through Massive Open On-line Courses (MOOCs) creates both opportunities and challenges. Programming-based classes need to provide consistent development infrastructures that are both scalable and user friendly to students. The \"Heterogeneous Parallel Programming\" class offered through Coursera teaches GPU programming and encountered these problems. We developed WebGPU - an online GPU development platform - providing students with a user friendly scalable GPU computing platform throughout the course. It has been used as the CUDA, OpenACC, and OpenCL programming environment for large Coursera courses, short-running summer schools, and traditional semester-long graduate and undergraduate courses. WebGPU has since replaced our traditional development infrastructure for the GPU classes offered at UIUC. This paper presents the original, revised, and upcoming WebGPU designs that address the requirements and challenges of offering sophisticated computing resources to a large, quickly-varying number of students.'
image = ""
image_preview = ""

View File

@@ -5,13 +5,7 @@ date = "2017-05-29"
title = "RAI: A Scalable Project Submission System for Parallel Programming Courses"
authors = ["Abdul Dakkak", "Carl Pearson", "Cheng Li"]
abstract = 'A major component of many advanced programming courses is an open-ended “end-of-term project” assignment.
Delivering and evaluating open-ended parallel programming projects for hundreds or thousands of students brings a
need for broad system reconfigurability coupled with challenges of testing and development uniformity, access to
esoteric hardware and programming environments, scalability, and security. We present RAI, a secure and extensible
system for delivering open-ended programming assignments configured with access to different hardware and software
requirements. We describe how the system was used to deliver a programming-competition-style final project in an introductory
GPU programming course at the University of Illinois Urbana-Champaign.'
abstract = 'A major component of many advanced programming courses is an open-ended “end-of-term project” assignment. Delivering and evaluating open-ended parallel programming projects for hundreds or thousands of students brings a need for broad system reconfigurability coupled with challenges of testing and development uniformity, access to esoteric hardware and programming environments, scalability, and security. We present RAI, a secure and extensible system for delivering open-ended programming assignments configured with access to different hardware and software requirements. We describe how the system was used to deliver a programming-competition-style final project in an introductory GPU programming course at the University of Illinois Urbana-Champaign.'
image = ""
image_preview = ""

View File

@@ -5,18 +5,7 @@ date = "2017-06-22"
title = "Comparative Performance Evaluation of Multi-GPU MLFMM Implementation for 2-D VIE Problems"
authors = ["Carl Pearson", "Mert Hidayetoglu", "Wei Ren", "Weng Cho Chew", "Wen-Mei Hwu"]
abstract = 'We compare multi-GPU performance of the multilevel
fast multipole method (MLFMM) on two different systems:
A shared-memory IBM S822LC workstation with four NVIDIA
P100 GPUs, and 16 XK nodes (each is employed with a
single NVIDIA K20X GPU) of the Blue Waters supercomputer.
MLFMM is implemented for solving scattering problems involving
two-dimensional inhomogeneous bodies. Results show that the
multi-GPU implementation provides 794 and 969 times speedups
on the IBM and Blue Waters systems over their corresponding
sequential CPU executions, respectively, where the sequential
execution on the IBM system is 1.17 times faster than on the
Blue Waters System.'
abstract = 'We compare multi-GPU performance of the multilevel fast multipole method (MLFMM) on two different systems: A shared-memory IBM S822LC workstation with four NVIDIA P100 GPUs, and 16 XK nodes (each is employed with a single NVIDIA K20X GPU) of the Blue Waters supercomputer. MLFMM is implemented for solving scattering problems involving two-dimensional inhomogeneous bodies. Results show that the multi-GPU implementation provides 794 and 969 times speedups on the IBM and Blue Waters systems over their corresponding sequential CPU executions, respectively, where the sequential execution on the IBM system is 1.17 times faster than on the Blue Waters System.'
image = ""
image_preview = ""