Skip to content

Commit

Permalink
[SYSTEMDS-1780] Final resource optimizer for AWS EMR
Browse files Browse the repository at this point in the history
Closes #2135.
  • Loading branch information
lachezar-n authored and mboehm7 committed Nov 17, 2024
1 parent c929843 commit e326add
Show file tree
Hide file tree
Showing 56 changed files with 5,175 additions and 1,226 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,7 @@ src/test/scripts/functions/pipelines/intermediates/classification/*

venv
venv/*

# resource optimization
scripts/resource/output
*.pem
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<html lang="en">
<head>
<!-- Generated by javadoc -->
<title>Uses of Class org.apache.sysds.resource.CloudUtils.InstanceType (Apache SystemDS 3.3.0-SNAPSHOT API)</title>
<title>Uses of Class org.apache.sysds.resource.CloudUtils.InstanceFamily (Apache SystemDS 3.3.0-SNAPSHOT API)</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<link rel="stylesheet" type="text/css" href="../../../../../stylesheet.css" title="Style">
<link rel="stylesheet" type="text/css" href="../../../../../jquery/jquery-ui.min.css" title="Style">
Expand Down Expand Up @@ -94,7 +94,7 @@
</header>
<main role="main">
<div class="header">
<h2 title="Uses of Class org.apache.sysds.resource.CloudUtils.InstanceType" class="title">Uses of Class<br>org.apache.sysds.resource.CloudUtils.InstanceType</h2>
<h2 title="Uses of Class org.apache.sysds.resource.CloudUtils.InstanceFamily" class="title">Uses of Class<br>org.apache.sysds.resource.CloudUtils.InstanceFamily</h2>
</div>
<div class="classUseContainer">
<ul class="blockList">
Expand Down
28 changes: 27 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@
<maven-shade-plugin.version>3.5.0</maven-shade-plugin.version>
<maven-compiler-plugin.version>3.11.0</maven-compiler-plugin.version>
<maven-antrun-plugin.version>3.1.0</maven-antrun-plugin.version>
<!-- aws-java-sdk-bundle version should align with hadoop-aws version -->
<!-- aws-java-sdk-bundle.version>1.12.367</aws-java-sdk-bundle.version -->
<!-- Set java compile level via argument, ex: 1.8 1.9 10 11-->
<java.level>11</java.level>
<java.version>{java.level}</java.version>
Expand Down Expand Up @@ -274,7 +276,7 @@
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
<mainClass>org.apache.sysds.api.ropt.Executor</mainClass>
<mainClass>org.apache.sysds.resource.ResourceOptimizer</mainClass>
</manifest>
<manifestEntries>
<Class-Path>SystemDS.jar ${project.artifactId}-${project.version}.jar</Class-Path>
Expand Down Expand Up @@ -413,6 +415,18 @@
<goal>run</goal>
</goals>
</execution>
<execution>
  <!-- Copies the "-ropt" JAR produced by the build to the fixed file name
       ResourceOptimizer.jar in the build directory during the package phase.
       The original JAR is kept, since this is a copy and not a move. -->
  <id>rename-ropt-jar</id>
  <phase>package</phase>
  <configuration>
    <target name="copy ropt JAR">
      <copy file="${project.build.directory}/${project.artifactId}-${project.version}-ropt.jar" tofile="${project.build.directory}/ResourceOptimizer.jar" />
    </target>
  </configuration>
  <goals>
    <goal>run</goal>
  </goals>
</execution>
</executions>
</plugin>

Expand Down Expand Up @@ -1337,6 +1351,18 @@
</exclusions>
</dependency>

<!--dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-aws</artifactId>
<version>${hadoop.version}</version>
</dependency-->

<!--dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-bundle</artifactId>
<version>${aws-java-sdk-bundle.version}</version>
</dependency-->

<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
Expand Down
169 changes: 169 additions & 0 deletions scripts/resource/README.md

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions scripts/resource/aws_regional_prices.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Region,Fee Ratio,EBS Price
af-south-1,0.195918367,0.1047
ap-east-1,0.181818182,0.1056
ap-northeast-1,0.193548387,0.096
ap-northeast-2,0.203389831,0.0912
ap-northeast-3,0.193548387,0.096
ap-south-1,0.237623762,0.0912
ap-south-2,0.237623762,0.0912
ap-southeast-1,0.2,0.096
ap-southeast-2,0.2,0.096
ap-southeast-3,0.2,0.096
ap-southeast-4,0.2,0.096
ap-southeast-5,0.235294118,0.0864
ca-central-1,0.224299065,0.088
ca-west-1,0.224299065,0.088
eu-central-1,0.208695652,0.0952
eu-central-2,0.18972332,0.1142
eu-north-1,0.235294118,0.0836
eu-south-1,0.214285714,0.0924
eu-south-2,0.224299065,0.088
eu-west-1,0.224299065,0.088
eu-west-2,0.216216216,0.0928
eu-west-3,0.214285714,0.0928
il-central-1,0.213333333,0.1056
me-central-1,0.204255319,0.0968
me-south-1,0.204255319,0.0968
sa-east-1,0.156862745,0.152
us-east-1,0.25,0.08
us-east-2,0.25,0.08
us-west-1,0.214285714,0.096
us-west-2,0.25,0.08
7 changes: 7 additions & 0 deletions scripts/resource/bin/systemds-ropt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

# Wrapper that runs the resource optimizer JAR, forwarding all command-line
# arguments and appending the default options file.
# Required env: SYSTEMDS_ROOT - root directory of the SystemDS checkout.

# Fail early with a clear message; otherwise an unset SYSTEMDS_ROOT would
# silently resolve both paths against "/".
: "${SYSTEMDS_ROOT:?SYSTEMDS_ROOT must be set to the SystemDS root directory}"

ROPT_JAR_FILE="${SYSTEMDS_ROOT}/target/ResourceOptimizer.jar"
DEFAULT_PROPERTIES="${SYSTEMDS_ROOT}/scripts/resource/options.properties"

if [[ ! -f "$ROPT_JAR_FILE" ]]; then
  echo "Error: $ROPT_JAR_FILE not found; build SystemDS first" >&2
  exit 1
fi

# exec replaces the shell so the exit status and signals belong to the JVM
exec java -jar "$ROPT_JAR_FILE" "$@" -options "$DEFAULT_PROPERTIES"

373 changes: 373 additions & 0 deletions scripts/resource/ec2_stats.csv

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions scripts/resource/launch/cluster.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Configurations for EMR launch

# User-defined configurations --------------------------------

# Program specific --------------------------------

# URI address of the SystemDS jar file on S3
SYSTEMDS_JAR_URI=
# DML script path (use s3a:// URI schema for remote scripts in S3)
SYSTEMDS_PROGRAM=s3://systemds-testing/dml_scripts/Algorithm_L2SVM.dml
# Set the file path arguments with the URI address adapted
# to the actual file location, always using the s3a:// schema
# (comma separated values)
SYSTEMDS_ARGS=
# comma separated key=value pairs
SYSTEMDS_NVARGS=m=200000,n=10000
#Y=s3://systemds-testing/data/Y.csv,B=s3a://systemds-testing/data/B.csv

# AWS specific -------------------------

# Inspect the version differences before changing to a version different from 7.3.0
EMR_VERSION="emr-7.3.0"
# output file of the resource optimization: hardware configurations
INSTANCE_CONFIGS=
# output file of the resource optimization: Spark configurations
SPARK_CONFIGS=
# existing SSH key (not created automatically)
KEYPAIR_NAME=
# Choose the same region that was used when executing the resource optimizer
REGION=us-east-1
# Optionally provide a (single) security group ID to be attached additionally to the master node
# If the value is empty, the option won't be used, AWS won't attach an additional group, and SSH access may be blocked
# Multiple additional groups are not supported by the launch script, and this one is attached to the master only
SECURITY_GROUP_ID=
# Provide already created names
# or desired names for generation with 'generate_instance_profile.sh'
INSTANCE_PROFILE_NAME=
IAM_ROLE_NAME=
# Desired subnet to be used by the cluster, if not defined a default one will be used
TARGET_SUBNET=
# S3 folder URI for landing of log files
LOG_URI=

# Execution specific -------------------------

# (number) - if 0 the cluster will be terminated automatically after program execution
# - if greater than 0 the cluster will be terminated automatically after the given number of seconds in idle state
# - if less than 0 no automatic termination rules will be applied
AUTO_TERMINATION_TIME=-1

# Automatic configurations (read only for users) -------------

# Current EMR Cluster ID
CLUSTER_ID=
# Public DNS name of the master node in the current cluster
CLUSTER_URL=
86 changes: 86 additions & 0 deletions scripts/resource/launch/cluster_launch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env bash
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Launches an EMR cluster through the AWS CLI using the settings sourced from
# cluster.env and, when a step definition is available, adds a step that runs
# SYSTEMDS_PROGRAM. The resulting cluster ID and master public DNS name are
# handed to set_config under the keys CLUSTER_ID and CLUSTER_URL.

# exit in case of error or unbound var
set -euo pipefail

# get file directory to allow finding the file with the utils
SCRIPT_DIR="$(dirname "$(realpath "$0")")"

# NOTE: cluster.env is resolved relative to the current working directory
source cluster.env
# presumably provides generate_step_definition and set_config - TODO confirm
source "$SCRIPT_DIR/cluster_utils.sh"

# These values are passed straight to the AWS CLI; an empty one only produces
# a cryptic CLI error, so fail early with a clear message instead.
for required in REGION LOG_URI INSTANCE_CONFIGS SPARK_CONFIGS; do
  if [[ -z "${!required}" ]]; then
    echo "Error: $required must be set in cluster.env" >&2
    exit 1
  fi
done

if [[ -n "$TARGET_SUBNET" ]]; then
  SUBNET=$TARGET_SUBNET
else
  # Get the first available subnet in the default VPC of the configured region
  SUBNET=$(aws ec2 describe-subnets --region "$REGION" \
    --filters "Name=defaultForAz,Values=true" \
    --query "Subnets[0].SubnetId" --output text)
  # with text output the CLI prints the literal "None" for an empty result
  if [[ -z "$SUBNET" || "$SUBNET" == "None" ]]; then
    echo "Error: no default subnet found in region $REGION; set TARGET_SUBNET in cluster.env" >&2
    exit 1
  fi
fi

# generate the step definition into STEP variable
generate_step_definition

# Assemble the EC2 attributes JSON piece by piece so that the additional
# security group entry is emitted only when a group ID is actually configured.
ec2_attributes='{"KeyName":"'"${KEYPAIR_NAME}"'","InstanceProfile":"EMR_EC2_DefaultRole",'
if [[ -n "$SECURITY_GROUP_ID" ]]; then
  ec2_attributes+='"AdditionalMasterSecurityGroups":["'"${SECURITY_GROUP_ID}"'"],'
fi
ec2_attributes+='"SubnetId":"'"${SUBNET}"'"}'

# Collect the optional CLI arguments in an array
optional_args=()
if [[ -n "$STEP" ]]; then
  # STEP is expanded unquoted on purpose so that it is word-split exactly as
  # in the previous inline form of this command.
  # shellcheck disable=SC2206
  optional_args+=(--steps $STEP)
fi
if [[ "$AUTO_TERMINATION_TIME" == 0 ]]; then
  optional_args+=(--auto-terminate)
elif [[ "$AUTO_TERMINATION_TIME" -gt 0 ]]; then
  optional_args+=(--auto-termination-policy "IdleTimeout=$AUTO_TERMINATION_TIME")
fi

echo -e "\nLaunching EMR cluster via AWS CLI and adding a step to run $SYSTEMDS_PROGRAM with SystemDS"
# The ${arr[@]+...} form keeps an empty array from tripping 'set -u' on old bash
CLUSTER_INFO=$(aws emr create-cluster \
  --applications Name=AmazonCloudWatchAgent Name=Spark \
  --ec2-attributes "$ec2_attributes" \
  --service-role EMR_DefaultRole \
  --enable-debugging \
  --release-label "$EMR_VERSION" \
  --log-uri "$LOG_URI" \
  --name "SystemDS cluster" \
  --instance-groups "file://$INSTANCE_CONFIGS" \
  --configurations "file://$SPARK_CONFIGS" \
  --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \
  --no-termination-protected \
  ${optional_args[@]+"${optional_args[@]}"} \
  --region "$REGION")

# -r prints the raw string, so no quote stripping is needed
CLUSTER_ID=$(jq -r .ClusterId <<<"$CLUSTER_INFO")
echo "Cluster successfully initialized with cluster ID: ${CLUSTER_ID}"
set_config "CLUSTER_ID" "$CLUSTER_ID"

# Wait for cluster to start
echo -e "\nWaiting for cluster to enter running state..."
aws emr wait cluster-running --cluster-id "$CLUSTER_ID" --region "$REGION"

CLUSTER_URL=$(aws emr describe-cluster --cluster-id "$CLUSTER_ID" --region "$REGION" \
  | jq -r .Cluster.MasterPublicDnsName)
set_config "CLUSTER_URL" "$CLUSTER_URL"

echo "...launching process has finished and the cluster is now in state running."

if [[ "$AUTO_TERMINATION_TIME" == 0 ]]; then
  echo -e "\nImmediate automatic termination was enabled so the cluster will terminate directly after the step completion"
elif [[ "$AUTO_TERMINATION_TIME" -gt 0 ]]; then
  echo -e "\nDelayed automatic termination was enabled so the cluster will terminate $AUTO_TERMINATION_TIME seconds after entering idle state"
else
  echo -e "\nAutomatic termination was not enabled so you should manually terminate the cluster"
fi
55 changes: 55 additions & 0 deletions scripts/resource/launch/cluster_run_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Adds a step running SYSTEMDS_PROGRAM to the already launched EMR cluster
# identified by CLUSTER_ID (sourced from cluster.env). When
# AUTO_TERMINATION_TIME is 0, the script waits for the step to complete and
# then terminates the cluster.

# exit in case of error or unbound var
set -euo pipefail

# get file directory to allow finding the file with the utils
SCRIPT_DIR="$(dirname "$(realpath "$0")")"

# NOTE: cluster.env is resolved relative to the current working directory
source cluster.env
# presumably provides generate_step_definition - TODO confirm
source "$SCRIPT_DIR/cluster_utils.sh"

# Without a cluster ID every AWS call below would fail with a cryptic message
if [[ -z "$CLUSTER_ID" ]]; then
  echo "Error: CLUSTER_ID is empty in cluster.env, launch a cluster first." >&2
  exit 1
fi

# generate the step definition into STEP variable
generate_step_definition
if [[ -z "$STEP" ]]; then
  echo "Error: Empty step definition, probably due to empty SYSTEMDS_PROGRAM option." >&2
  exit 1
fi

echo "Adding a step to run $SYSTEMDS_PROGRAM with SystemDS"
# STEP is expanded unquoted on purpose so that it is word-split as before.
# shellcheck disable=SC2086
STEP_INFO=$(aws emr add-steps --cluster-id "$CLUSTER_ID" --region "$REGION" --steps $STEP)

if [[ "$AUTO_TERMINATION_TIME" == 0 ]]; then
  # exactly one step was submitted, so the first returned ID is the one to wait for
  STEP_ID=$(jq -r '.StepIds[0]' <<<"$STEP_INFO")
  echo "Waiting for the step to finish before termination (immediate automatic termination enabled)"
  aws emr wait step-complete --cluster-id "$CLUSTER_ID" --step-id "$STEP_ID" --region "$REGION"
  echo "The step has finished and now the cluster will be immediately terminated"
  # pass the region explicitly, like every other call, so that termination
  # does not depend on the default region of the AWS CLI profile
  aws emr terminate-clusters --cluster-ids "$CLUSTER_ID" --region "$REGION"
elif [[ "$AUTO_TERMINATION_TIME" -gt 0 ]]; then
  echo "Delayed automatic termination will apply only in case this option was set on cluster launch."
  echo "You should manually track the step completion"
else
  echo "Automatic termination was not enabled so you should manually track the step completion and terminate the cluster"
fi


Loading

0 comments on commit e326add

Please sign in to comment.