# tpcds: workflow file for run #1071 of the "tpcds" workflow.

# Runs the TPC-DS data generation and query steps defined in the job below.
# Triggers: pushes and pull requests against main that touch a tpcds.yml
# file, a daily schedule, and manual dispatch.
name: tpcds
on:
  push:
    branches:
      - 'main'
    paths:
      - '**/tpcds.yml'
  pull_request:
    branches:
      - 'main'
    paths:
      - '**/tpcds.yml'
  schedule:
    # Daily at 20:30 (GitHub evaluates cron schedules in UTC).
    - cron: '30 20 * * *'
  workflow_dispatch:
    inputs:
      debug:
        type: boolean
        # The debugging step at the end of the job opens an upterm session,
        # so the description names upterm rather than tmate.
        description: "Run the build with upterm debugging enabled"
        required: false
        default: false
jobs:
  tpcds:
    # NOTE(review): GitHub has been retiring the ubuntu-20.04 hosted image;
    # confirm this runner label is still available.
    runs-on: ubuntu-20.04
    services:
      # Redis service container, published on the host port that the
      # "Juicefs Format" step addresses as redis://127.0.0.1:6379/1.
      redis:
        image: redis
        ports:
          - "6379:6379"
        # Health check: the container counts as healthy once
        # `redis-cli ping` succeeds.
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
# Turn swap off on the runner before anything else runs.
- run: sudo swapoff -a
# Java 8 (Temurin distribution) for the Java-based steps that follow.
- name: Set up Java
  uses: actions/setup-java@v3
  with:
    java-version: '8'
    distribution: 'temurin'
# Shallow checkout: only the most recent commit is fetched.
- name: Checkout
  uses: actions/checkout@v3
  with:
    fetch-depth: 1
# Local action defined in this repository under .github/actions/build.
# Presumably it produces the ./juicefs binary used by the format step;
# confirm against the action definition.
- name: Build
  uses: ./.github/actions/build
  timeout-minutes: 10
# Restore/save ~/.m2/repository, keyed on the hash of every pom.xml, with a
# fallback to any earlier cache for the same runner OS.
- name: Cache local Maven repository
  uses: actions/cache@v3
  with:
    path: ~/.m2/repository
    key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
    restore-keys: |
      ${{ runner.os }}-maven-
# Runs the default make target in sdk/java; the Spark setup step later copies
# juicefs-hadoop*jar from sdk/java/target.
- name: Build Hadoop SDK
  run: make -C sdk/java
# Format a volume named "dev" against Redis database 1, with /tmp/jfs as the
# bucket and trash disabled, then open up permissions on /tmp/jfs/dev/ so the
# non-root steps can use it.
- name: Juicefs Format
  run: |
    sudo ./juicefs format redis://127.0.0.1:6379/1 dev --trash-days 0 --bucket /tmp/jfs
    sudo chmod -R 777 /tmp/jfs/dev/
# Download the newest non-preview Spark release listed on dlcdn.apache.org,
# unpack it under /tmp/spark, add the JuiceFS Hadoop jar and core-site.xml,
# and start a local standalone master plus one worker.
# NOTE(review): "newest release" is a moving target; confirm it stays
# compatible with the Java 8 toolchain set up earlier and with the
# scala-2.12 jar path used by the Gen data / Run tpcds steps.
- name: Set up Spark
  working-directory: /tmp
  run: |
    # -f makes curl fail on an HTTP error instead of saving the error page,
    # which would otherwise be parsed as an (empty) release listing.
    curl -fsSL https://dlcdn.apache.org/spark/ -o a.html
    spark_version=$(grep -oP '(?<=href=")spark-[^/]+(?=/")' a.html | grep -v preview | tail -1)
    # Stop with a clear message rather than a confusing wget failure below.
    if [ -z "$spark_version" ]; then
      echo "could not determine a Spark release from the download listing" >&2
      exit 1
    fi
    echo "spark_version is $spark_version"
    wget -q "https://dlcdn.apache.org/spark/$spark_version/$spark_version-bin-hadoop3.tgz"
    tar -zxf "$spark_version-bin-hadoop3.tgz"
    # -sfn keeps the step re-runnable if /tmp/spark already exists.
    ln -sfn "$spark_version-bin-hadoop3" spark
    # Use the workspace expression, as the tpcds-kit and spark-sql-perf steps
    # do, instead of a hard-coded ~/work/juicefs/juicefs checkout path.
    cp "${{ github.workspace }}"/sdk/java/target/juicefs-hadoop*jar /tmp/spark/jars
    cp "${{ github.workspace }}/.github/workflows/resources/core-site.xml" /tmp/spark/conf
    # PATH for the rest of this step; GITHUB_PATH for the following steps.
    export PATH=$PATH:/tmp/spark/bin:/tmp/spark/sbin
    echo /tmp/spark/bin >> "$GITHUB_PATH"
    echo /tmp/spark/sbin >> "$GITHUB_PATH"
    start-master.sh -i localhost
    # Spark 3.1 renamed start-slave.sh to start-worker.sh; prefer the new
    # name when the downloaded release ships it, otherwise fall back.
    if command -v start-worker.sh > /dev/null; then
      start-worker.sh spark://localhost:7077
    else
      start-slave.sh spark://localhost:7077
    fi
# Install the C build tools through the repository's apt_install.sh helper,
# then clone databricks/tpcds-kit into /tmp and build its tools directory
# for Linux.
- name: Set up tpcds-kit
  working-directory: /tmp
  run: |
    echo workspace is ${{ github.workspace }}
    sudo ${{ github.workspace }}/.github/scripts/apt_install.sh gcc make flex bison byacc git
    git clone --depth 1 https://github.com/databricks/tpcds-kit.git
    cd tpcds-kit/tools
    make OS=LINUX
# Install sbt, point it at Maven Central and the Typesafe ivy releases, then
# clone and package databricks/spark-sql-perf under /tmp.
- name: Set up spark-sql-perf
  working-directory: /tmp
  run: |
    # NOTE(review): sbt is requested here before the scala-sbt apt source and
    # signing key are registered below; confirm apt_install.sh copes with
    # that, or that the runner image already provides sbt.
    sudo ${{ github.workspace }}/.github/scripts/apt_install.sh apt-transport-https curl gnupg sbt
    echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list
    echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list
    curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo -H gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import
    sudo chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg
    # -p: do not abort the step (the shell runs with -e) when ~/.sbt already
    # exists on the runner.
    mkdir -p ~/.sbt
    # Quoted delimiter: the file is written literally, with no shell expansion.
    cat > ~/.sbt/repositories <<'EOF'
    [repositories]
    local
    maven-central: https://repo1.maven.org/maven2/
    typesafe-releases: https://repo.typesafe.com/typesafe/ivy-releases/, [organization]/[module]/(scala_[scalaVersion]/)(sbt_[sbtVersion]/)[revision]/[type]s/[artifact](-[classifier]).[ext]
    EOF
    git clone --depth 1 https://github.com/databricks/spark-sql-perf.git
    cd spark-sql-perf
    sbt package
# Generate the TPC-DS data set by feeding tpcds_datagen.scala to spark-shell
# on the local standalone master. A failed attempt is retried once.
- name: Gen data
  timeout-minutes: 35
  working-directory: /tmp
  run: |
    # One data-generation attempt. The script path uses the workspace
    # expression, as the setup steps do, instead of a hard-coded
    # ~/work/juicefs/juicefs checkout path.
    gen_data() {
      spark-shell \
        --jars /tmp/spark-sql-perf/target/scala-2.12/spark-sql-perf*.jar \
        --master spark://localhost:7077 \
        --deploy-mode client \
        --executor-memory 2G \
        --driver-memory 2G \
        --executor-cores 1 \
        --conf spark.sql.shuffle.partitions=10 \
        -i "${{ github.workspace }}/.github/workflows/resources/tpcds_datagen.scala"
    }
    # Run once; on failure run exactly one more time.
    gen_data || gen_data
# Run the TPC-DS queries by feeding tpcds_run.scala to spark-shell on the
# local standalone master. A failed attempt is retried once.
- name: Run tpcds
  timeout-minutes: 30
  working-directory: /tmp
  run: |
    # One benchmark attempt. The script path uses the workspace expression,
    # as the setup steps do, instead of a hard-coded ~/work/juicefs/juicefs
    # checkout path.
    run_tpcds() {
      spark-shell \
        --jars /tmp/spark-sql-perf/target/scala-2.12/spark-sql-perf*.jar \
        --master spark://localhost:7077 \
        --deploy-mode client \
        --executor-memory 2G \
        --driver-memory 2G \
        --executor-cores 1 \
        --conf spark.sql.shuffle.partitions=10 \
        -i "${{ github.workspace }}/.github/workflows/resources/tpcds_run.scala"
    }
    # Run once; on failure run exactly one more time.
    run_tpcds || run_tpcds
# Always runs: if /var/log/juicefs.log exists, print its last 1000 lines and
# fail the step when it contains a "<FATAL>:" entry.
- name: Log
  if: always()
  run: |
    if [ -f /var/log/juicefs.log ]; then
      echo "juicefs log"
      sudo tail -n 1000 /var/log/juicefs.log
      # Read with sudo, like the tail above, so a log that only root can
      # read is not silently treated as "no fatal entries".
      if sudo grep "<FATAL>:" /var/log/juicefs.log; then
        exit 1
      fi
    fi
# Runs only when an earlier step failed; the channel id and bot token are
# read from repository secrets.
- name: Send Slack Notification
  if: failure()
  uses: juicedata/slack-notify-action@main
  with:
    slack_bot_token: "${{ secrets.SLACK_BOT_TOKEN }}"
    channel-id: "${{ secrets.SLACK_CHANNEL_ID_FOR_PR_CHECK_NOTIFY }}"
# Opens an upterm session after a failure, but only when the run was
# dispatched with debug=true or is a re-run (run_attempt is not 1). The step
# is capped at 60 minutes.
- name: Setup upterm session
  if: failure() && (github.event.inputs.debug == 'true' || github.run_attempt != 1)
  uses: lhotari/action-upterm@v1
  timeout-minutes: 60