-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
233 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
@inproceedings{chu_efficient_2023, | ||
abstract = {Sparse matrix-vector multiplication (SpMV) is a fundamental building block for various numerical computing applications. However, most existing GPU-SpMV approaches may suffer from either long preprocessing overhead, load imbalance, format conversion, bad memory access patterns. In this paper, we proposed two new SpMV algorithms: flat and line-enhance, as well as their implementations, for GPU systems to overcome the above shortcomings. Our algorithms work directly on the CSR sparse matrix format. To achieve high performance: 1) for load balance, the flat algorithm uses non-zero splitting and line-enhance uses a mix of row and non-zero splitting; 2) memory access patterns are designed for both algorithms for data loading, storing and reduction steps; and 3) an adaptive approach is proposed to select appropriate algorithm and parameters based on matrix characteristics. | ||
We evaluate our methods using the SuiteSparse Matrix Collection on AMD and NVIDIA GPU platforms. Average performance improvements of 424%, 741%, 49%, 46%, 72% are achieved when comparing our adaptive approach with CSR-Vector, CSR-Adaptive, HOLA, cuSparse and merge-based SpMV, respectively. In bandwidth tests, our approach can also achieve a high memory bandwidth, which is very close to the peak memory bandwidth.}, | ||
address = {Orlando, Florida}, | ||
author = {Chu, Genshen and He, Yuanjie and Dong, Lingyu and Ding, Zhezhao and Chen, Dandan and Bai, He and Wang, Xuesong and Hu, Changjun}, | ||
booktitle = {Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing (HPDC '23), June 16–23, 2023, Orlando, FL, USA}, | ||
copyright = {All rights reserved}, | ||
doi = {10.1145/3588195.3593002}, | ||
isbn = {979-8-4007-0155-9}, | ||
language = {en}, | ||
note = {event-place: Orlando, FL, USA}, | ||
pages = {1--14}, | ||
publisher = {ACM Press}, | ||
series = {HPDC '23}, | ||
title = {Efficient Algorithm Design of Optimizing SpMV on GPU}, | ||
url = {http://doi.org/10.1145/3588195.3593002}, | ||
year = {2023} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
--- | ||
# Documentation: https://wowchemy.com/docs/managing-content/ | ||
|
||
title: Efficient Algorithm Design of Optimizing SpMV on GPU | ||
subtitle: '' | ||
summary: '' | ||
authors: | ||
- Genshen Chu | ||
- Yuanjie He | ||
- Lingyu Dong | ||
- Zhezhao Ding | ||
- Dandan Chen | ||
- He Bai | ||
- Xuesong Wang | ||
- Changjun Hu | ||
tags: [] | ||
categories: [] | ||
date: '2023-01-01' | ||
lastmod: 2023-09-20T14:45:21+08:00 | ||
featured: false | ||
draft: false | ||
|
||
# Featured image | ||
# To use, add an image named `featured.jpg/png` to your page's folder. | ||
# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight. | ||
image: | ||
caption: '' | ||
focal_point: '' | ||
preview_only: false | ||
|
||
# Projects (optional). | ||
# Associate this post with one or more of your projects. | ||
# Simply enter your project's folder or file name without extension. | ||
# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`. | ||
# Otherwise, set `projects = []`. | ||
projects: [] | ||
publishDate: '2023-09-20T06:47:53.258484Z' | ||
publication_types: | ||
- '1' | ||
abstract: 'Sparse matrix-vector multiplication (SpMV) is a fundamental building block | ||
for various numerical computing applications. However, most existing GPU-SpMV approaches | ||
may suffer from either long preprocessing overhead, load imbalance, format conversion, | ||
bad memory access patterns. In this paper, we proposed two new SpMV algorithms: flat | ||
and line-enhance, as well as their implementations, for GPU systems to overcome the | ||
above shortcomings. Our algorithms work directly on the CSR sparse matrix format. | ||
To achieve high performance: 1) for load balance, the flat algorithm uses non-zero | ||
splitting and line-enhance uses a mix of row and non-zero splitting; 2) memory access | ||
patterns are designed for both algorithms for data loading, storing and reduction | ||
steps; and 3) an adaptive approach is proposed to select appropriate algorithm and | ||
parameters based on matrix characteristics. We evaluate our methods using the SuiteSparse | ||
Matrix Collection on AMD and NVIDIA GPU platforms. Average performance improvements | ||
of 424%, 741%, 49%, 46%, 72% are achieved when comparing our adaptive approach with | ||
CSR-Vector, CSR-Adaptive, HOLA, cuSparse and merge-based SpMV, respectively. In | ||
bandwidth tests, our approach can also achieve a high memory bandwidth, which is | ||
very close to the peak memory bandwidth.' | ||
publication: "*Proceedings of the 32nd International Symposium on High-Performance\ | ||
\ Parallel and Distributed Computing (HPDC '23), June 16–23, 2023, Orlando, FL,\ | ||
\ USA*" | ||
doi: 10.1145/3588195.3593002 | ||
links: | ||
- name: URL | ||
url: http://doi.org/10.1145/3588195.3593002 | ||
--- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
@article{chu_md_2021, | ||
author = {Chu, Genshen and Li, Yang and Zhao, Runchu and Ren, Shuai and Yang, Wen and He, Xinfu and Hu, Changjun and Wang, Jue}, | ||
copyright = {All rights reserved}, | ||
doi = {10.1016/j.cpc.2021.108128}, | ||
issn = {00104655}, | ||
journal = {Computer Physics Communications}, | ||
language = {en}, | ||
month = {August}, | ||
number = {1}, | ||
pages = {108128}, | ||
title = {MD Simulation of Hundred-Billion-Metal-Atom Cascade Collision on Sunway Taihulight}, | ||
url = {https://linkinghub.elsevier.com/retrieve/pii/S001046552100240X}, | ||
urldate = {2021-08-08}, | ||
volume = {269}, | ||
year = {2021} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
--- | ||
# Documentation: https://wowchemy.com/docs/managing-content/ | ||
|
||
title: MD Simulation of Hundred-Billion-Metal-Atom Cascade Collision on Sunway Taihulight | ||
subtitle: '' | ||
summary: '' | ||
authors: | ||
- Genshen Chu | ||
- Yang Li | ||
- Runchu Zhao | ||
- Shuai Ren | ||
- Wen Yang | ||
- Xinfu He | ||
- Changjun Hu | ||
- Jue Wang | ||
tags: [] | ||
categories: [] | ||
date: '2021-08-01' | ||
lastmod: 2023-01-04T22:24:34+08:00 | ||
featured: false | ||
draft: false | ||
|
||
# Featured image | ||
# To use, add an image named `featured.jpg/png` to your page's folder. | ||
# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight. | ||
image: | ||
caption: '' | ||
focal_point: '' | ||
preview_only: false | ||
|
||
# Projects (optional). | ||
# Associate this post with one or more of your projects. | ||
# Simply enter your project's folder or file name without extension. | ||
# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`. | ||
# Otherwise, set `projects = []`. | ||
projects: [] | ||
publishDate: '2023-01-04T14:29:12.088506Z' | ||
publication_types: | ||
- '2' | ||
abstract: '' | ||
publication: '*Computer Physics Communications*' | ||
doi: 10.1016/j.cpc.2021.108128 | ||
links: | ||
- name: URL | ||
url: https://linkinghub.elsevier.com/retrieve/pii/S001046552100240X | ||
--- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
@article{hu_kernel_2017, | ||
abstract = {To optimize short-range force computations in Molecular Dynamics (MD) simulations, multi-threading and SIMD optimizations are presented in this paper. With respect to multi-threading optimization, a Partition-and-Separate-Calculation (PSC) method is designed to avoid write conflicts caused by using Newton’s third law. Serial bottlenecks are eliminated with no additional memory usage. The method is implemented by using the OpenMP model. Furthermore, the PSC method is employed on Intel Xeon Phi coprocessors in both native and offload models. We also evaluate the performance of the PSC method under different thread affinities on the MIC architecture. In the SIMD execution, we explain the performance influence in the PSC method, considering the “if-clause” of the cutoff radius check. The experiment results show that our PSC method is relatively more efficient compared to some traditional methods. In double precision, our 256-bit SIMD implementation is about 3 times faster than the scalar version.}, | ||
author = {Hu, Changjun and Wang, Xianmeng and Li, Jianjiang and He, Xinfu and Li, Shigang and Feng, Yangde and Yang, Shaofeng and Bai, He}, | ||
doi = {10.1016/j.cpc.2016.07.010}, | ||
issn = {00104655}, | ||
journal = {Computer Physics Communications}, | ||
language = {en}, | ||
month = {February}, | ||
pages = {31--40}, | ||
title = {Kernel optimization for short-range molecular dynamics}, | ||
url = {https://linkinghub.elsevier.com/retrieve/pii/S0010465516301928}, | ||
urldate = {2019-01-22}, | ||
volume = {211}, | ||
year = {2017} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
--- | ||
# Documentation: https://wowchemy.com/docs/managing-content/ | ||
|
||
title: Kernel optimization for short-range molecular dynamics | ||
subtitle: '' | ||
summary: '' | ||
authors: | ||
- Changjun Hu | ||
- Xianmeng Wang | ||
- Jianjiang Li | ||
- Xinfu He | ||
- Shigang Li | ||
- Yangde Feng | ||
- Shaofeng Yang | ||
- He Bai | ||
tags: [] | ||
categories: [] | ||
date: '2017-02-01' | ||
lastmod: 2023-01-04T22:29:03+08:00 | ||
featured: false | ||
draft: false | ||
|
||
# Featured image | ||
# To use, add an image named `featured.jpg/png` to your page's folder. | ||
# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight. | ||
image: | ||
caption: '' | ||
focal_point: '' | ||
preview_only: false | ||
|
||
# Projects (optional). | ||
# Associate this post with one or more of your projects. | ||
# Simply enter your project's folder or file name without extension. | ||
# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`. | ||
# Otherwise, set `projects = []`. | ||
projects: [] | ||
publishDate: '2023-01-04T14:29:11.845539Z' | ||
publication_types: | ||
- '2' | ||
abstract: To optimize short-range force computations in Molecular Dynamics (MD) simulations, | ||
multi-threading and SIMD optimizations are presented in this paper. With respect | ||
to multi-threading optimization, a Partition-and-Separate-Calculation (PSC) method | ||
is designed to avoid write conflicts caused by using Newton’s third law. Serial | ||
bottlenecks are eliminated with no additional memory usage. The method is implemented | ||
by using the OpenMP model. Furthermore, the PSC method is employed on Intel Xeon | ||
Phi coprocessors in both native and offload models. We also evaluate the performance | ||
of the PSC method under different thread affinities on the MIC architecture. In | ||
the SIMD execution, we explain the performance influence in the PSC method, considering | ||
the “if-clause” of the cutoff radius check. The experiment results show that our | ||
PSC method is relatively more efficient compared to some traditional methods. In | ||
double precision, our 256-bit SIMD implementation is about 3 times faster than the | ||
scalar version. | ||
publication: '*Computer Physics Communications*' | ||
doi: 10.1016/j.cpc.2016.07.010 | ||
links: | ||
- name: URL | ||
url: https://linkinghub.elsevier.com/retrieve/pii/S0010465516301928 | ||
--- |
598e64e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Deploy preview for hpcde-github-io ready!
✅ Preview
https://hpcde-github-g5sk5gasv-genshen.vercel.app
Built with commit 598e64e.
This pull request is being automatically deployed with vercel-action