Skip to content

Commit

Permalink
chore: use a smaller spark base image (#103)
Browse files Browse the repository at this point in the history
  • Loading branch information
chgl authored Sep 1, 2023
1 parent 84df54e commit 862e124
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 14 deletions.
7 changes: 7 additions & 0 deletions docker-compose/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ Open <http://localhost:8084/> to view the cluster's topics.
docker compose -f compose.decompose-xmls.yaml up
```

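## Convert the FHIR resources to a CSV dataset

The conversion container runs as user `1001:1001` and writes its results to the bind-mounted `./opal-output/` directory, so that directory must be writable by this UID/GID before the job is started:
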
```sh
sudo chown -R 1001:1001 ./opal-output/
docker compose -f compose.adtfhir-to-opal.yaml up
```

## Start the entire stack

```sh
Expand Down
6 changes: 3 additions & 3 deletions docker-compose/compose.adtfhir-to-opal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ services:
ipc: private
security_opt:
- "no-new-privileges:true"
user: "1000:100"
user: "1001:1001"
environment:
OUTPUT_FOLDER: "/home/jovyan/opal-output"
OUTPUT_FOLDER: "/opt/bitnami/spark/opal-output"
KAFKA_TOPIC_YEAR_SUFFIX: "" # e.g. ".2023"
KAFKA_BOOTSTRAP_SERVER: "kafka:9092"
KAFKA_PATIENT_TOPIC: "fhir.onkoadt.Patient"
Expand All @@ -19,4 +19,4 @@ services:
KAFKA_PROCEDURE_TOPIC: "fhir.onkoadt.Procedure"
KAFKA_MEDICATIONSTATEMENT_TOPIC: "fhir.onkoadt.MedicationStatement"
volumes:
- ${PWD}/opal-output:/home/jovyan/opal-output
- ${PWD}/opal-output:/opt/bitnami/spark/opal-output
18 changes: 11 additions & 7 deletions src/adtfhir_to_opal/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
FROM docker.io/jupyter/pyspark-notebook:spark-3.3.2@sha256:86f23b36bbd1900e10ce15bb29cf55ce31b10b1406c5afa6e57acf529cb10093
WORKDIR /home/jovyan
USER 1000:100
FROM docker.io/bitnami/spark:3.3.2@sha256:11ccd03367cadc0da48432e7636746e98a842324f590630f6d14299a40ff2ee4
ENV SPARK_JARS_IVY="/home/spark/.ivy"
WORKDIR /opt/bitnami/spark
USER 0
RUN groupadd -g 1001 spark && \
useradd spark -u 1001 -g spark -m -s /bin/bash

COPY requirements.txt requirements.txt

RUN <<EOF
pip install --no-cache-dir -r requirements.txt
spark-shell -v --packages "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,au.csiro.pathling:library-api:6.2.1,ch.cern.sparkmeasure:spark-measure_2.13:0.21,io.delta:delta-core_2.12:2.3.0"
EOF
RUN pip install --no-cache-dir -r requirements.txt

USER 1001:1001
RUN spark-shell -v --conf spark.jars.ivy=${SPARK_JARS_IVY}\
--packages "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,au.csiro.pathling:library-api:6.2.1,ch.cern.sparkmeasure:spark-measure_2.13:0.21,io.delta:delta-core_2.12:2.3.0"

COPY adtfhir_to_opal.py adtfhir_to_opal.py

Expand Down
5 changes: 4 additions & 1 deletion src/adtfhir_to_opal/adtfhir_to_opal.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Settings(BaseSettings):
kafka_observation_topic: str = "fhir.onkoadt.Observation"
kafka_procedure_topic: str = "fhir.onkoadt.Procedure"
kafka_medicationstatement_topic: str = "fhir.onkoadt.MedicationStatement"
# ⚠️ make sure these are consistent with the ones downloaded inside the Dockerfile
jar_list: list = [
"org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2",
"au.csiro.pathling:library-api:6.2.1",
Expand All @@ -35,6 +36,8 @@ class Settings(BaseSettings):
spark_driver_memory: str = "8g"
spark_executor_cores: str = "4"

spark_jars_ivy: str = "/home/spark/.ivy2"


settings = Settings()

Expand All @@ -51,7 +54,7 @@ def setup_spark_session(appName: str, master: str):
.config("spark.executor.cores", settings.spark_executor_cores)
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.config("spark.jars.packages", ",".join(settings.jar_list))
.config("spark.jars.ivy", "/home/jovyan/.ivy2")
.config("spark.jars.ivy", settings.spark_jars_ivy)
.config(
"spark.sql.catalog.spark_catalog",
"org.apache.spark.sql.delta.catalog.DeltaCatalog",
Expand Down
6 changes: 3 additions & 3 deletions src/adtfhir_to_opal/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ services:
context: .
dockerfile: Dockerfile
environment:
OUTPUT_FOLDER: "/home/jovyan/opal-output"
KAFKA_TOPIC_YEAR_SUFFIX: ".2023"
OUTPUT_FOLDER: "/opt/bitnami/spark/opal-output"
KAFKA_TOPIC_YEAR_SUFFIX: ""
KAFKA_BOOTSTRAP_SERVER: "kafka:9092"
KAFKA_PATIENT_TOPIC: "fhir.onkoadt.Patient"
KAFKA_CONDITION_TOPIC: "fhir.onkoadt.Condition"
KAFKA_OBSERVATION_TOPIC: "fhir.onkoadt.Observation"
KAFKA_PROCEDURE_TOPIC: "fhir.onkoadt.Procedure"
KAFKA_MEDICATIONSTATEMENT_TOPIC: "fhir.onkoadt.MedicationStatement"
volumes:
- ${PWD}/output:/home/jovyan/opal-output
- ${PWD}/output:/opt/bitnami/spark/opal-output
1 change: 1 addition & 0 deletions src/adtfhir_to_opal/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pathling==6.2.1
pydantic==1.10.9
pyspark==3.3.2
pandas==2.1.0

0 comments on commit 862e124

Please sign in to comment.