From 5bc49668de0facadfb4e08a3c413ee9028c461ed Mon Sep 17 00:00:00 2001 From: asumitamazon Date: Wed, 4 Oct 2023 14:30:17 -0700 Subject: [PATCH] Release Spark 3.4 Signed-off-by: asumitamazon --- Makefile | 4 +- Pipfile | 51 ++++--- Pipfile.lock | 2 +- cython_constraint.txt | 1 + new_images.yml | 4 +- pyproject.toml | 3 + setup.py | 48 ++++++ smsparkbuild/py39/Pipfile | 46 +++--- .../container-bootstrap-config/bootstrap.sh | 1 + .../3.4/py3/docker/py39/Dockerfile.cpu | 137 ++++++++++++++++++ .../3.4/py3/hadoop-config/core-site.xml | 31 ++++ .../3.4/py3/hadoop-config/hdfs-site.xml | 67 +++++++++ .../3.4/py3/hadoop-config/spark-defaults.conf | 10 ++ .../3.4/py3/hadoop-config/spark-env.sh | 3 + .../3.4/py3/hadoop-config/yarn-site.xml | 34 +++++ .../3.4/py3/nginx-config/default.conf | 17 +++ .../3.4/py3/nginx-config/nginx.conf | 66 +++++++++ spark/processing/3.4/py3/yum/emr-apps.repo | 17 +++ test/integration/conftest.py | 26 +--- 19 files changed, 493 insertions(+), 75 deletions(-) create mode 100644 cython_constraint.txt create mode 100644 pyproject.toml create mode 100644 setup.py create mode 100644 spark/processing/3.4/py3/container-bootstrap-config/bootstrap.sh create mode 100644 spark/processing/3.4/py3/docker/py39/Dockerfile.cpu create mode 100644 spark/processing/3.4/py3/hadoop-config/core-site.xml create mode 100644 spark/processing/3.4/py3/hadoop-config/hdfs-site.xml create mode 100644 spark/processing/3.4/py3/hadoop-config/spark-defaults.conf create mode 100644 spark/processing/3.4/py3/hadoop-config/spark-env.sh create mode 100644 spark/processing/3.4/py3/hadoop-config/yarn-site.xml create mode 100644 spark/processing/3.4/py3/nginx-config/default.conf create mode 100644 spark/processing/3.4/py3/nginx-config/nginx.conf create mode 100644 spark/processing/3.4/py3/yum/emr-apps.repo diff --git a/Makefile b/Makefile index 73ed9d7..678a4e0 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ SHELL := /bin/sh # Set variables if testing locally ifeq ($(IS_RELEASE_BUILD),) - SPARK_VERSION := 3.3 + SPARK_VERSION := 3.4 PROCESSOR := cpu FRAMEWORK_VERSION := py39 SM_VERSION := 1.0 @@ -51,7 +51,7 @@ install-container-library: init # temporarily bypass py=1.1.0 because pytest-parallel has a dependency on it however the module is no longer maitained. # In the future the pylib will be removed from pytest-parallel dependency and 51457 should only impact the local tests. 
# For more info, https://github.com/pytest-dev/py/issues/287 - pipenv run safety check -i 43975 -i 51457 # https://github.com/pyupio/safety + pipenv run safety check -i 43975 -i 51457 -i 39611 # https://github.com/pyupio/safety build-static-config: ./scripts/fetch-ec2-instance-type-info.sh --region ${REGION} --use-case ${USE_CASE} --spark-version ${SPARK_VERSION} \ diff --git a/Pipfile b/Pipfile index 08f9cfc..775d817 100644 --- a/Pipfile +++ b/Pipfile @@ -6,35 +6,38 @@ verify_ssl = true [dev-packages] [packages] -tenacity = "==8.0.1" -psutil = "==5.9.0" -click = "==8.1.2" -watchdog = "==0.10.3" +pyyaml = "==5.3.1" +tenacity = "==8.2.3" +psutil = "==5.9.5" +click = "==8.1.7" +watchdog = "==3.0.0" waitress = "==2.1.2" -types-waitress = "==2.0.6" -requests = "==2.27.1" -types-requests = "==2.27.16" -rsa = "==4.3" -pyasn1 = "==0.4.8" -boto3 = "==1.21.33" -safety = "==2.3.1" -black = "==22.3.0" -mypy = "==0.942" -flake8 = "==4.0.1" -flake8-docstrings = "==1.5.0" -pytest = "==7.1.1" -pytest-cov = "==2.10.0" -pytest-xdist = "==2.5.0" -docker = "==5.0.3" +types-waitress = "==2.1.4.9" +requests = "==2.31.0" +types-requests = "==2.31.0.2" +rsa = "==4.7" +pyasn1 = "==0.5.0" +boto3 = "==1.28.38" +safety = "==2.3.5" +black = "==22.12.0" +mypy = "==1.5.1" +flake8 = "==6.1.0" +flake8-docstrings = "==1.7.0" +pytest = "==7.4.0" +pytest-cov = "==4.1.0" +pytest-xdist = "==3.3.1" +docker = "==6.1.3" docker-compose = "==1.29.2" -cryptography = "==36.0.2" -typing-extensions = "==4.1.1" +cryptography = "==41.0.3" +typing-extensions = "==4.7.1" sagemaker = "==2.117.0" smspark = {editable = true, path = "."} -importlib-metadata = "==4.11.3" +importlib-metadata = "==4.13.0" pytest-parallel = "==0.1.1" -pytest-rerunfailures = "10.0" -numpy = "==1.22.2" +pytest-rerunfailures = "==12.0" +numpy = "==1.25.2" +py = "==1.11.0" +awscli = "==1.29.38" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index e50f52b..5c6d9aa 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1049,4 +1049,4 @@ } }, "develop": {} -} +} \ No newline at end of file diff --git a/cython_constraint.txt b/cython_constraint.txt new file mode 100644 index 0000000..d5e3183 --- /dev/null +++ b/cython_constraint.txt @@ -0,0 +1 @@ +Cython<3 diff --git a/new_images.yml b/new_images.yml index 3d7f115..6ba3e03 100644 --- a/new_images.yml +++ b/new_images.yml @@ -1,7 +1,7 @@ --- new_images: - - spark: "3.3" + - spark: "3.4" use-case: "processing" processors: ["cpu"] python: ["py39"] - sm_version: "1.2" + sm_version: "1.0" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f83c639 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.black] +line-length = 120 +target-version = ['py39'] diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5802b8b --- /dev/null +++ b/setup.py @@ -0,0 +1,48 @@ +# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+import glob +import os + +from setuptools import find_packages, setup + +with open("VERSION", "r") as version_file: + version = version_file.read() + +setup( + name="smspark", + description="Library that enables running Spark Processing jobs on Amazon SageMaker", + version=version, + python_requires=">3.7.0", + packages=find_packages("src"), + package_dir={"": "src"}, + py_modules=[os.path.splitext(os.path.basename(path))[0] for path in glob.glob("src/smspark/*.py")], + author="Amazon Web Services", + url="https://github.com/aws/smspark/", + license="Apache License 2.0", + keywords="ML Amazon AWS AI SageMaker Processing Spark", + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Natural Language :: English", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.9", + ], + setup_requires=["setuptools", "wheel"], + entry_points={ + "console_scripts": [ + "smspark-submit=smspark.cli:submit_main", + "smspark-history-server=smspark.history_server_cli:run_history_server", + ] + }, +) diff --git a/smsparkbuild/py39/Pipfile b/smsparkbuild/py39/Pipfile index fdd8f94..775d817 100644 --- a/smsparkbuild/py39/Pipfile +++ b/smsparkbuild/py39/Pipfile @@ -6,36 +6,38 @@ verify_ssl = true [dev-packages] [packages] -tenacity = "==8.0.1" -psutil = "==5.9.0" -click = "==8.1.2" -watchdog = "==0.10.3" +pyyaml = "==5.3.1" +tenacity = "==8.2.3" +psutil = "==5.9.5" +click = "==8.1.7" +watchdog = "==3.0.0" waitress = "==2.1.2" -types-waitress = "==2.0.6" +types-waitress = "==2.1.4.9" requests = "==2.31.0" -types-requests = "==2.27.16" -rsa = "==4.9" -pyasn1 = "==0.4.8" -boto3 = "==1.21.33" +types-requests = "==2.31.0.2" +rsa = "==4.7" +pyasn1 = "==0.5.0" +boto3 = "==1.28.38" safety = "==2.3.5" -black = "==22.3.0" -mypy = "==0.942" -flake8 = "==4.0.1" -flake8-docstrings = "==1.5.0" -pytest = "==7.2.2" -pytest-cov = "==2.10.0" -pytest-xdist = "==3.2.1" -docker = "==5.0.3" +black = "==22.12.0" +mypy = "==1.5.1" +flake8 = "==6.1.0" +flake8-docstrings = "==1.7.0" +pytest = "==7.4.0" +pytest-cov = "==4.1.0" +pytest-xdist = "==3.3.1" +docker = "==6.1.3" docker-compose = "==1.29.2" -cryptography = "==39.0.2" -typing-extensions = "==4.1.1" +cryptography = "==41.0.3" +typing-extensions = "==4.7.1" sagemaker = "==2.117.0" smspark = {editable = true, path = "."} -importlib-metadata = "==4.11.3" +importlib-metadata = "==4.13.0" pytest-parallel = "==0.1.1" -pytest-rerunfailures = "10.0" -numpy = "==1.22.2" +pytest-rerunfailures = "==12.0" +numpy = "==1.25.2" py = "==1.11.0" +awscli = "==1.29.38" [requires] python_version = "3.9" diff --git a/spark/processing/3.4/py3/container-bootstrap-config/bootstrap.sh b/spark/processing/3.4/py3/container-bootstrap-config/bootstrap.sh new file mode 100644 index 0000000..e4e23bb --- /dev/null +++ b/spark/processing/3.4/py3/container-bootstrap-config/bootstrap.sh @@ -0,0 +1 @@ +echo "Not implemented" \ No newline at end of file diff --git a/spark/processing/3.4/py3/docker/py39/Dockerfile.cpu b/spark/processing/3.4/py3/docker/py39/Dockerfile.cpu new file mode 100644 index 0000000..860cf48 --- /dev/null +++ b/spark/processing/3.4/py3/docker/py39/Dockerfile.cpu @@ -0,0 +1,137 @@ +FROM 137112412989.dkr.ecr.us-west-2.amazonaws.com/amazonlinux:2 +ARG REGION +ENV AWS_REGION ${REGION} + +ENV JAVA_HOME /etc/alternatives/jre + +RUN yum clean all \ + && yum update -y \ + && yum install -y awscli bigtop-utils curl gcc gzip unzip zip gunzip tar wget liblapack* libblas* libopencv* 
libopenblas* + +# Taken from EMR https://tiny.amazon.com/1dp4p55nm/codeamazpackAwsCblob8b00src +RUN amazon-linux-extras enable corretto8 nginx1 \ +&& yum install -y java-1.8.0-amazon-corretto-devel nginx python-virtualenv \ +&& yum remove -y java-1.8.0-openjdk-headless + +# Install python 3.9 +ARG PYTHON_BASE_VERSION=3.9 +ARG PYTHON_WITH_BASE_VERSION=python${PYTHON_BASE_VERSION} +ARG PIP_WITH_BASE_VERSION=pip${PYTHON_BASE_VERSION} +ARG PYTHON_VERSION=${PYTHON_BASE_VERSION}.12 +RUN yum -y groupinstall 'Development Tools' \ + && yum -y install openssl-devel bzip2-devel libffi-devel sqlite-devel xz-devel \ + && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ + && tar xzf Python-${PYTHON_VERSION}.tgz \ + && cd Python-*/ \ + && ./configure --enable-optimizations \ + && make altinstall \ + && echo -e 'alias python3=python3.9\nalias pip3=pip3.9' >> ~/.bashrc \ + && ln -s $(which ${PYTHON_WITH_BASE_VERSION}) /usr/local/bin/python3 \ + && ln -s $(which ${PIP_WITH_BASE_VERSION}) /usr/local/bin/pip3 \ + && cd .. \ + && rm Python-${PYTHON_VERSION}.tgz \ + && rm -rf Python-${PYTHON_VERSION} + +# install nginx amazonlinux:2.0.20200304.0 does not have nginx, so need to install epel-release first +RUN wget https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum install -y nginx + +RUN rm -rf /var/cache/yum + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed +ENV PYTHONHASHSEED 0 +ENV PYTHONIOENCODING UTF-8 +ENV PIP_DISABLE_PIP_VERSION_CHECK 1 + +# Install EMR Spark/Hadoop +ENV HADOOP_HOME /usr/lib/hadoop +ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop +ENV SPARK_HOME /usr/lib/spark + +COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo + +# Install hadoop / spark dependencies from EMR's yum repository for Spark optimizations. +# replace placeholder with region in repository URL +RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo +RUN adduser -N hadoop + +# These packages are a subset of what EMR installs in a cluster with the +# "hadoop", "spark", and "hive" applications. +# They include EMR-optimized libraries and extras. 
+RUN yum install -y aws-hm-client \ + aws-java-sdk \ + aws-sagemaker-spark-sdk \ + emr-goodies \ + emr-ruby \ + emr-scripts \ + emr-s3-select \ + emrfs \ + hadoop \ + hadoop-client \ + hadoop-hdfs \ + hadoop-hdfs-datanode \ + hadoop-hdfs-namenode \ + hadoop-httpfs \ + hadoop-kms \ + hadoop-lzo \ + hadoop-yarn \ + hadoop-yarn-nodemanager \ + hadoop-yarn-proxyserver \ + hadoop-yarn-resourcemanager \ + hadoop-yarn-timelineserver \ + hive \ + hive-hcatalog \ + hive-hcatalog-server \ + hive-jdbc \ + hive-server2 \ + s3-dist-cp \ + spark-core \ + spark-datanucleus \ + spark-external \ + spark-history-server \ + spark-python + +# Point Spark at proper python binary +ENV PYSPARK_PYTHON=/usr/local/bin/python3.9 + +# Setup Spark/Yarn/HDFS user as root +ENV PATH="/usr/bin:/opt/program:${PATH}" +ENV YARN_RESOURCEMANAGER_USER="root" +ENV YARN_NODEMANAGER_USER="root" +ENV HDFS_NAMENODE_USER="root" +ENV HDFS_DATANODE_USER="root" +ENV HDFS_SECONDARYNAMENODE_USER="root" + +# Set up bootstrapping program and Spark configuration +COPY hadoop-config /opt/hadoop-config +COPY nginx-config /opt/nginx-config +COPY aws-config /opt/aws-config +COPY Pipfile Pipfile.lock setup.py *.whl /opt/program/ +ENV PIPENV_PIPFILE=/opt/program/Pipfile +# Use --system flag, so it will install all packages into the system python, +# and not into the virtualenv. Since docker containers do not need to have virtualenvs +# pipenv > 2022.4.8 fails to build smspark +RUN /usr/local/bin/python3.9 -m pip install pipenv==2022.4.8 \ + && pipenv install --system \ + && /usr/local/bin/python3.9 -m pip install /opt/program/*.whl + +# Setup container bootstrapper +COPY container-bootstrap-config /opt/container-bootstrap-config +RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh \ + && /opt/container-bootstrap-config/bootstrap.sh + +# With this config, spark history server will not run as daemon, otherwise there +# will be no server running and container will terminate immediately +ENV SPARK_NO_DAEMONIZE TRUE + +WORKDIR $SPARK_HOME + +# Install the sagemaker feature store spark connector +# https://docs.aws.amazon.com/sagemaker/latest/dg/batch-ingestion-spark-connector-setup.html +# Feature store connector library currently does not support spark 3.4 so commenting out this line +# RUN /usr/local/bin/python3.9 -m pip install sagemaker-feature-store-pyspark-3.3==1.1.2 --no-binary :all: + +ENTRYPOINT ["smspark-submit"] diff --git a/spark/processing/3.4/py3/hadoop-config/core-site.xml b/spark/processing/3.4/py3/hadoop-config/core-site.xml new file mode 100644 index 0000000..2b032ff --- /dev/null +++ b/spark/processing/3.4/py3/hadoop-config/core-site.xml @@ -0,0 +1,31 @@ + + + + + + + fs.defaultFS + hdfs://nn_uri/ + NameNode URI + + + fs.s3a.aws.credentials.provider + com.amazonaws.auth.DefaultAWSCredentialsProviderChain + AWS S3 credential provider + + + fs.s3.impl + org.apache.hadoop.fs.s3a.S3AFileSystem + s3a filesystem implementation + + + fs.AbstractFileSystem.s3a.imp + org.apache.hadoop.fs.s3a.S3A + s3a filesystem implementation + + + fs.s3a.connection.maximum + 100 + s3a filesystem maximum connection + + diff --git a/spark/processing/3.4/py3/hadoop-config/hdfs-site.xml b/spark/processing/3.4/py3/hadoop-config/hdfs-site.xml new file mode 100644 index 0000000..37e0a5b --- /dev/null +++ b/spark/processing/3.4/py3/hadoop-config/hdfs-site.xml @@ -0,0 +1,67 @@ + + + + + + + dfs.datanode.data.dir + file:///opt/amazon/hadoop/hdfs/datanode + Comma separated list of paths on the local filesystem of a DataNode where it should store its\ + blocks. 
+ + + + dfs.namenode.name.dir + file:///opt/amazon/hadoop/hdfs/namenode + Path on the local filesystem where the NameNode stores the namespace and transaction logs per\ + sistently. + + + + + dfs.client.block.write.replace-datanode-on-failure.enable + true + + If there is a datanode/network failure in the write pipeline, + DFSClient will try to remove the failed datanode from the pipeline + and then continue writing with the remaining datanodes. As a result, + the number of datanodes in the pipeline is decreased. The feature is + to add new datanodes to the pipeline. + + This is a site-wide property to enable/disable the feature. + + When the cluster size is extremely small, e.g. 3 nodes or less, cluster + administrators may want to set the policy to NEVER in the default + configuration file or disable this feature. Otherwise, users may + experience an unusually high rate of pipeline failures since it is + impossible to find new datanodes for replacement. + + See also dfs.client.block.write.replace-datanode-on-failure.policy + + + + + dfs.client.block.write.replace-datanode-on-failure.policy + ALWAYS + + This property is used only if the value of + dfs.client.block.write.replace-datanode-on-failure.enable is true. + + ALWAYS: always add a new datanode when an existing datanode is + removed. + + NEVER: never add a new datanode. + + DEFAULT: + Let r be the replication number. + Let n be the number of existing datanodes. + Add a new datanode only if r is greater than or equal to 3 and either + (1) floor(r/2) is greater than or equal to n; or + (2) r is greater than n and the block is hflushed/appended. + + + diff --git a/spark/processing/3.4/py3/hadoop-config/spark-defaults.conf b/spark/processing/3.4/py3/hadoop-config/spark-defaults.conf new file mode 100644 index 0000000..734086a --- /dev/null +++ b/spark/processing/3.4/py3/hadoop-config/spark-defaults.conf @@ -0,0 +1,10 @@ +spark.driver.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar +spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native +spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar +spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native +spark.driver.host=sd_host +spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 + +# Fix for "Uncaught exception: org.apache.spark.rpc.RpcTimeoutException: Cannot +# receive any reply from 10.0.109.30:35219 in 120 seconds."" +spark.rpc.askTimeout=300s diff --git 
a/spark/processing/3.4/py3/hadoop-config/spark-env.sh b/spark/processing/3.4/py3/hadoop-config/spark-env.sh new file mode 100644 index 0000000..1b58aa1 --- /dev/null +++ b/spark/processing/3.4/py3/hadoop-config/spark-env.sh @@ -0,0 +1,3 @@ +#EMPTY FILE AVOID OVERRIDDING ENV VARS +# Specifically, without copying the empty file, SPARK_HISTORY_OPTS will be overriden, +# spark.history.ui.port defaults to 18082, and spark.eventLog.dir defaults to local fs diff --git a/spark/processing/3.4/py3/hadoop-config/yarn-site.xml b/spark/processing/3.4/py3/hadoop-config/yarn-site.xml new file mode 100644 index 0000000..1c92988 --- /dev/null +++ b/spark/processing/3.4/py3/hadoop-config/yarn-site.xml @@ -0,0 +1,34 @@ + + + + + yarn.resourcemanager.hostname + rm_hostname + The hostname of the RM. + + + yarn.nodemanager.hostname + nm_hostname + The hostname of the NM. + + + yarn.nodemanager.webapp.address + nm_webapp_address + + + yarn.nodemanager.vmem-pmem-ratio + 5 + Ratio between virtual memory to physical memory. + + + yarn.resourcemanager.am.max-attempts + 1 + The maximum number of application attempts. + + + yarn.nodemanager.env-whitelist + JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,YARN_HOME,AWS_CONTAINER_CREDENTIALS_RELATIVE_URI,AWS_REGION + Environment variable whitelist + + + diff --git a/spark/processing/3.4/py3/nginx-config/default.conf b/spark/processing/3.4/py3/nginx-config/default.conf new file mode 100644 index 0000000..a8a50a5 --- /dev/null +++ b/spark/processing/3.4/py3/nginx-config/default.conf @@ -0,0 +1,17 @@ +server { + listen 15050; + server_name localhost; + client_header_buffer_size 128k; + large_client_header_buffers 4 128k; + + location ~ ^/history/(.*)/(.*)/jobs/$ { + proxy_pass http://localhost:18080/history/$1/jobs/; + proxy_redirect http://localhost:18080/history/$1/jobs/ $domain_name/proxy/15050/history/$1/jobs/; + expires off; + } + + location / { + proxy_pass http://localhost:18080; + expires off; + } +} \ No newline at end of file diff --git a/spark/processing/3.4/py3/nginx-config/nginx.conf b/spark/processing/3.4/py3/nginx-config/nginx.conf new file mode 100644 index 0000000..1e3a51c --- /dev/null +++ b/spark/processing/3.4/py3/nginx-config/nginx.conf @@ -0,0 +1,66 @@ +# For more information on configuration, see: +# * Official English Documentation: http://nginx.org/en/docs/ +# * Official Russian Documentation: http://nginx.org/ru/docs/ + +user nginx; +worker_processes auto; +error_log /var/log/nginx/error.log; +pid /run/nginx.pid; + +# Load dynamic modules. See /usr/share/doc/nginx/README.dynamic. +include /usr/share/nginx/modules/*.conf; + +events { + worker_connections 1024; +} + +http { + log_format main '$remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for"'; + + access_log /var/log/nginx/access.log main; + + sendfile on; + tcp_nopush on; + tcp_nodelay on; + keepalive_timeout 65; + types_hash_max_size 2048; + + include /etc/nginx/mime.types; + default_type application/octet-stream; + + # Load modular configuration files from the /etc/nginx/conf.d directory. + # See http://nginx.org/en/docs/ngx_core_module.html#include + # for more information. + include /etc/nginx/conf.d/*.conf; + + server { + listen 80 default_server; + listen [::]:80 default_server; + server_name _; + root /usr/share/nginx/html; + + # Load configuration files for the default server block. 
+ include /etc/nginx/default.d/*.conf; + + location /proxy/15050 { + proxy_pass http://localhost:15050/; + } + + location ~ ^/proxy/15050/(.*) { + proxy_pass http://localhost:15050/$1; + } + + location / { + } + + error_page 404 /404.html; + location = /40x.html { + } + + error_page 500 502 503 504 /50x.html; + location = /50x.html { + } + } +} \ No newline at end of file diff --git a/spark/processing/3.4/py3/yum/emr-apps.repo b/spark/processing/3.4/py3/yum/emr-apps.repo new file mode 100644 index 0000000..2d361d6 --- /dev/null +++ b/spark/processing/3.4/py3/yum/emr-apps.repo @@ -0,0 +1,17 @@ +[emr-apps] +name = EMR Application Repository +gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.12.0/cfe221e4-88c4-464c-ad10-42f53049e5e7/repoPublicKey.txt +enabled = 1 +baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.12.0/cfe221e4-88c4-464c-ad10-42f53049e5e7 +priority = 5 +gpgcheck = 0 + +[emr-puppet] +mirrorlist: http://amazonlinux.$awsregion.$awsdomain/$releasever/extras/emr-puppet/latest/$basearch/mirror.list +enabled: 1 +gpgcheck: 1 +name: Amazon Extras repo for emr-puppet +gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-amazon-linux-2 +priority: 10 +skip_if_unavailable: 1 +report_instanceid: yes diff --git a/test/integration/conftest.py b/test/integration/conftest.py index 96d328f..528863f 100644 --- a/test/integration/conftest.py +++ b/test/integration/conftest.py @@ -121,31 +121,9 @@ def sagemaker_session(boto_session, sagemaker_client) -> Session: @pytest.fixture(scope="session") def is_feature_store_available(region) -> bool: - """Check if feature store is available in current region.""" + """Check if feature store is available in current region. Setting list empty since FS library not available for + spark releases later than 3.3. Otherwise it will fail tests during release.""" feature_store_released_regions = [ - "us-east-1", - "us-west-2", - "us-east-2", - "us-west-1", - "ap-northeast-1", - "ap-northeast-2", - "ap-northeast-3", - "ap-east-1", - "ap-south-1", - "ap-southeast-1", - "ap-southeast-2", - "me-south-1", - "sa-east-1", - "ca-central-1", - "eu-central-1", - "eu-north-1", - "eu-west-1", - "eu-west-2", - "eu-west-3", - "af-south-1", - "eu-south-1", - "cn-northwest-1", - "cn-north-1", ] return region in feature_store_released_regions
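
A short, hedged sketch of how the Spark 3.4 processing image released by this patch is typically exercised, using the SageMaker Python SDK version pinned in the Pipfile above (sagemaker==2.117.0). This is not part of the patch: the role ARN, instance settings, script path, and S3 URIs are placeholder assumptions, and it presumes the SDK accepts `framework_version="3.4"` once the new image is registered. The `smspark-submit` console script declared in setup.py above is what the container's entrypoint runs for the submitted application.

```python
# Illustrative only: running a job against the new Spark 3.4 processing image
# via the SageMaker Python SDK. All identifiers below are placeholders.
from sagemaker.spark.processing import PySparkProcessor

processor = PySparkProcessor(
    base_job_name="sm-spark-3-4",
    framework_version="3.4",  # assumed to map to the image added by this patch
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder role ARN
    instance_count=2,
    instance_type="ml.m5.xlarge",
)

# The container invokes smspark-submit (the entry point defined in setup.py)
# to launch this application on the in-container YARN/Spark cluster.
processor.run(
    submit_app="./preprocess.py",  # placeholder local PySpark script
    arguments=["--input", "s3://my-bucket/raw", "--output", "s3://my-bucket/out"],
)
```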