Logger #178

Merged 14 commits on Dec 4, 2024
33 changes: 30 additions & 3 deletions .github/workflows/create-release-n-publish.yml
@@ -12,6 +8 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Set up Python 3.10
uses: actions/setup-python@v5
@@ -21,11 +23,13 @@
- name: Get branch names
id: branch-name
uses: tj-actions/branch-names@v6

- name: Update version.py
run: |
export PYTHONPATH=$PYTHONPATH:flowcept
export BRANCH_NAME="${{ steps.branch-name.outputs.current_branch }}"
python .github/workflows/version_bumper.py

- name: Commit new version
run: |
git config --global user.name 'Flowcept CI Bot'
@@ -70,6 +74,7 @@ jobs:
pip install
build
--user

- name: Build a binary wheel and a source tarball
run: >-
python -m
@@ -78,6 +83,7 @@
--wheel
--outdir dist/
.

- name: Publish distribution to Test PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
@@ -91,29 +97,50 @@
with:
password: ${{ secrets.PYPI_API_TOKEN }}
verbose: true

- name: Wait pypi do its thing
run: sleep 120

- name: Test pip install
run: pip install flowcept

- name: Print installed version
run: pip list | grep flowcept

- name: Test pip install one adapter
run: pip install flowcept[dask]

- name: Test pip install multiple adapters
run: pip install flowcept[mlflow,tensorboard]

- name: Install our dependencies
run: pip install flowcept[all]

- name: Install ml_dev dependencies
run: pip install flowcept[ml_dev]

- name: Pip list
run: pip list
- name: Run Docker Compose
run: docker compose -f deployment/compose.yml up -d
run: pip list

- name: Start up services
run: make services

- name: Test with pytest
run: pytest --ignore=tests/decorator_tests/ml_tests/llm_tests

- name: Test notebooks
run: |
# export FLOWCEPT_SETTINGS_PATH=~/.flowcept/settings.yaml
python src/flowcept/flowcept_webserver/app.py &
sleep 3
pytest --nbmake "notebooks/" --nbmake-timeout=600 --ignore=notebooks/dask_from_CLI.ipynb

- name: Stop services
run: make services-stop

- name: Clean up
run: |
make clean
find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \; || true
docker image prune -a -f

7 changes: 7 additions & 0 deletions .github/workflows/run-checks.yml
@@ -11,6 +11,8 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Set up Python 3.10
uses: actions/setup-python@v5
@@ -29,3 +31,8 @@ jobs:

- name: Run HTML builder for Sphinx documentation
run: make docs

- name: Clean up
run: |
make clean
find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \; || true
11 changes: 11 additions & 0 deletions .github/workflows/run-tests-in-container.yml
@@ -10,6 +10,8 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Show OS Info
run: '[[ "$OSTYPE" == "linux-gnu"* ]] && { echo "OS Type: Linux"; (command -v lsb_release &> /dev/null && lsb_release -a) || cat /etc/os-release; uname -r; } || [[ "$OSTYPE" == "darwin"* ]] && { echo "OS Type: macOS"; sw_vers; uname -r; } || echo "Unsupported OS type: $OSTYPE"'
@@ -22,3 +24,12 @@

- name: Run tests in container
run: make tests-in-container

- name: Stop services
run: make services-stop

- name: Clean up
run: |
make clean
find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \; || true
docker image prune -a -f
18 changes: 10 additions & 8 deletions .github/workflows/run-tests-kafka.yml
@@ -13,6 +13,8 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Set up Python 3.10
uses: actions/setup-python@v5
@@ -48,11 +50,11 @@ jobs:
- name: Test notebooks
run: pytest --ignore=notebooks/zambeze.ipynb --nbmake "notebooks/" --nbmake-timeout=600 --ignore=notebooks/dask_from_CLI.ipynb

# - name: Test notebooks
# run: |
# pip install -e .[all] # Installing stuff again may not be needed
# export MQ_TYPE=kafka
# export MQ_PORT=9092
# python -c 'from flowcept.configs import MQ_TYPE, MQ_PORT; print(f"MQ_TYPE={MQ_TYPE}"); print(f"MQ_PORT={MQ_PORT}")'
# python -c 'from flowcept import Flowcept; assert Flowcept.services_alive()'
# pytest --ignore=notebooks/zambeze.ipynb --nbmake "notebooks/" --nbmake-timeout=600 --ignore=notebooks/dask_from_CLI.ipynb
- name: Stop services
run: docker compose -f deployment/compose-kafka.yml down

- name: Clean up
run: |
make clean
find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \; || true
docker image prune -a -f
18 changes: 17 additions & 1 deletion .github/workflows/run-tests-py11.yml
@@ -10,6 +10,8 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Set up Python 3.11
uses: actions/setup-python@v5
@@ -68,9 +70,14 @@
- name: Test notebooks with pytest and redis
run: make tests-notebooks

- name: Shut down docker compose
- name: Stop services
run: make services-stop

- name: Clean up
run: |
make clean
find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \; || true

- name: Start docker compose with kafka
run: docker compose -f deployment/compose-kafka.yml up -d

@@ -90,3 +97,12 @@
export MQ_PORT=9092
# Ignoring heavy tests. They are executed with Kafka in another GH Action.
pytest --ignore=tests/decorator_tests/ml_tests --ignore=tests/adapters/test_tensorboard.py

- name: Stop services
run: docker compose -f deployment/compose-kafka.yml down

- name: Clean up
run: |
make clean
find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \; || true
docker image prune -a -f
19 changes: 19 additions & 0 deletions .github/workflows/run-tests.yml
@@ -13,6 +13,8 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Set up Python 3.10
uses: actions/setup-python@v5
@@ -83,6 +85,11 @@ jobs:
- name: Shut down docker compose
run: make services-stop

- name: Clean up
run: |
make clean
find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \;

- name: Start docker compose with kafka
run: docker compose -f deployment/compose-kafka.yml up -d

@@ -101,3 +108,15 @@
export MQ_TYPE=kafka
export MQ_PORT=9092
make tests

- name: Stop docker compose with kafka
run: docker compose -f deployment/compose-kafka.yml down

- name: Clean up
run: |
make clean
find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \; || true
docker image prune -a -f

- name: List large files
run: find . -type f -exec du -h {} + | sort -h
26 changes: 14 additions & 12 deletions Makefile
@@ -26,18 +26,20 @@ reformat:

# Remove cache directories and Sphinx build output
clean:
rm -rf .ruff_cache
rm -rf .pytest_cache
rm -rf mnist_data
rm -rf tensorboard_events
rm -f docs_dump_tasks_*
rm -f dump_test.json
find . -type f -name "*.log" -exec rm -f {} \;
find . -type f -name "*.pth" -exec rm -f {} \;
find . -type f -name "mlflow.db" -exec rm -f {} \;
find . -type d -name "mlruns" -exec rm -rf {} \;
find . -type d -name "__pycache__" -exec rm -rf {} \; 2>/dev/null
sphinx-build -M clean docs docs/_build
rm -rf .ruff_cache || true
rm -rf .pytest_cache || true
rm -rf mnist_data || true
rm -rf tensorboard_events || true
rm -f docs_dump_tasks_* || true
rm -f dump_test.json || true
find . -type f -name "*.log" -exec rm -f {} \; || true
find . -type f -name "*.pth" -exec rm -f {} \; || true
find . -type f -name "mlflow.db" -exec rm -f {} \; || true
find . -type d -name "mlruns" -exec rm -rf {} \; 2>/dev/null || true
find . -type d -name "__pycache__" -exec rm -rf {} \; 2>/dev/null || true
find . -type d -name "*tfevents*" -exec rm -rf {} \; 2>/dev/null || true
# sphinx-build -M clean docs docs/_build This needs to be fixed.

# Build the HTML documentation using Sphinx
.PHONY: docs
68 changes: 62 additions & 6 deletions README.md
@@ -6,15 +6,40 @@

# FlowCept

FlowCept is a runtime data integration system that empowers any data processing system to capture and query workflow provenance data using data observability, requiring minimal or no changes in the target system code. It seamlessly integrates data from multiple workflows, enabling users to comprehend complex, heterogeneous, and large-scale data from various sources in federated environments.

FlowCept is intended to address scenarios where multiple workflows in a science campaign or in an enterprise run and generate important data to be analyzed in an integrated manner. Since these workflows may use different data manipulation tools (e.g., provenance or lineage capture tools, database systems, performance profiling tools) or can be executed within different parallel computing systems (e.g., Dask, Spark, Workflow Management Systems), its key differentiator is the capability to seamless and automatically integrate data from various workflows using data observability. It builds an integrated data view at runtime enabling end-to-end exploratory data analysis and monitoring. It follows [W3C PROV](https://www.w3.org/TR/prov-overview/) recommendations for its data schema. It does not require changes in user codes or systems (i.e., instrumentation). All users need to do is to create adapters for their systems or tools, if one is not available yet. In addition to observability, we provide instrumentation options for convenience. For example, by adding a `@flowcept_task` decorator on functions, FlowCept will observe their executions when they run. Also, we provide special features for PyTorch modules. Adding `@torch_task` to them will enable extra model inspection to be captured and integrated in the database at runtime.
FlowCept is a runtime data integration system that enables any data processing system to capture and query workflow provenance with minimal or no code changes. It integrates data across workflows, providing insights into complex, large-scale, and heterogeneous data in federated environments. It also offers additional features for Machine Learning (ML) workflows.

FlowCept is designed for scenarios where multiple workflows generate critical data requiring integrated analysis. These workflows may use diverse tools (e.g., provenance capture, databases, performance profiling, ML frameworks) or run on different data processing systems. FlowCept’s key capability is to seamlessly integrate data using observability, creating a unified data view at runtime for end-to-end analysis and monitoring.

Other capabilities include:

- Automatic multi-workflow provenance data capture;
- Data observability, enabling minimal intrusion into user workflows;
- Explicit user workflow instrumentation, if this is preferred over data observability;
- ML data capture at various levels of detail: workflow, model fitting or evaluation task, epoch iteration, and layer forwarding;
- ML model management;
- Adapter-based system architecture, making it easy to plug and play with different data processing systems and backend databases (e.g., MongoDB) or MQ services (e.g., Redis, Kafka);
- Low-overhead system architecture that avoids adding performance overhead, particularly for workloads that run on HPC machines;
- Telemetry data capture (e.g., CPU, GPU, and memory consumption) linked to the application dataflow;
- Highly customizable to multiple use cases, enabling easy toggling between settings, e.g., with/without provenance capture, with/without telemetry and which telemetry types to capture, and which adapters or backend services to run with (see the configuration sketch after the lists below);
- [W3C PROV](https://www.w3.org/TR/prov-overview/) adherence.

Notes:

Currently, FlowCept provides adapters for: [Dask](https://www.dask.org/), [MLFlow](https://mlflow.org/), [TensorBoard](https://www.tensorflow.org/tensorboard), and [Zambeze](https://github.com/ORNL/zambeze).
- Currently implemented data observability adapters:
  - MLFlow
  - Dask
  - TensorBoard
- Python scripts can be easily instrumented via decorators: use `@flowcept_task` for generic Python methods or `@torch_task` for methods that encapsulate PyTorch model manipulation, such as training or evaluation.
- Currently supported MQ systems:
  - Kafka
  - Redis
- Currently supported database systems:
  - MongoDB
  - Lightning Memory-Mapped Database (LMDB, a lightweight, file-based database)
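
As an example of the toggling mentioned above, the CI workflows in this PR switch the message queue from Redis to Kafka purely through environment variables. The sketch below mirrors the `MQ_TYPE`/`MQ_PORT` variables and the liveness check used in the workflow files above; it is illustrative only, and the exact configuration keys may differ in your FlowCept version.

```python
import os

# Select the MQ backend before importing FlowCept, mirroring the variables
# exported in run-tests-kafka.yml / run-tests.yml (Kafka on port 9092;
# the docker-compose default is Redis).
os.environ["MQ_TYPE"] = "kafka"
os.environ["MQ_PORT"] = "9092"

# Import after setting the variables so flowcept.configs picks them up.
from flowcept.configs import MQ_TYPE, MQ_PORT
print(f"MQ_TYPE={MQ_TYPE}, MQ_PORT={MQ_PORT}")

# Sanity-check that the MQ and database services (e.g., started with
# `make services` or deployment/compose-kafka.yml) are reachable.
from flowcept import Flowcept
assert Flowcept.services_alive()
```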

See the [Jupyter Notebooks](notebooks) and [Examples](examples) for utilization examples.
Explore [Jupyter Notebooks](notebooks) and [Examples](examples) for usage.

See the [Contributing](CONTRIBUTING.md) file for guidelines to contribute with new adapters. Note that we may use the term 'plugin' in the codebase as a synonym to adapter. Future releases should standardize the terminology to use adapter.
Refer to [Contributing](CONTRIBUTING.md) for adding new adapters. Note: The term "plugin" in the codebase is synonymous with "adapter," and future updates will standardize terminology.

## Install and Setup:

Expand Down Expand Up @@ -47,6 +72,37 @@ For convenience, the default needed services can be started using a [docker-comp

4. See the [Jupyter Notebooks](notebooks) and [Examples directory](examples) for utilization examples.

## Installing and Running with Docker

To use containers instead of installing FlowCept's dependencies on your host system, we provide a [Dockerfile](deployment/Dockerfile) alongside a [docker-compose.yml](deployment/compose.yml) for dependent services (e.g., Redis, MongoDB).

#### Notes:
- As shown in the steps below, there are [Makefile](Makefile) commands to build and run the image; please use them instead of running the Docker build and run commands directly.
- The Dockerfile builds from a local `miniconda` image, which will be built first using the [build-image.sh](deployment/build-image.sh) script.
- All dependencies for all adapters are installed, increasing build time. Edit the Dockerfile to customize dependencies based on our [pyproject.toml](pyproject.toml) to reduce build time if needed.

#### Steps:

1. Build the Docker image:
```bash
make build
```

2. Start dependent services:
```bash
make services
```

3. Run the image interactively:
```bash
make run
```

4. Optionally, run Unit tests in the container:
```bash
make tests-in-container
```

### Simple Example with Decorators Instrumentation

In addition to the existing adapters for Dask, MLFlow, and others (it is extensible to any system that generates data), FlowCept also offers instrumentation via decorators.
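
A minimal sketch of what this looks like, assuming the `flowcept_task` decorator and the `Flowcept` context manager exported by the package (the authoritative version is in the collapsed README example, the notebooks, and the examples directory; names here are illustrative):

```python
from flowcept import Flowcept, flowcept_task

# Each decorated function becomes an observed task whose inputs and outputs
# are captured as provenance records.
@flowcept_task
def sum_one(n):
    return n + 1

@flowcept_task
def mult_two(n):
    return n * 2

# The Flowcept context manager starts and stops capture around the workflow run.
with Flowcept(workflow_name="decorated_example"):
    result = mult_two(sum_one(3))
    print(result)
```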
11 changes: 9 additions & 2 deletions deployment/compose-kafka.yml
@@ -9,10 +9,10 @@ services:
flowcept_mongo:
container_name: flowcept_mongo
image: mongo:latest
# volumes:
# - /Users/rsr/Downloads/mongo_data/db:/data/db
ports:
- 27017:27017
volumes:
- mongo_data:/data/db

zookeeper:
image: confluentinc/cp-zookeeper:6.1.1
@@ -54,3 +54,10 @@
echo -e 'Successfully created the following topics:'
kafka-topics --bootstrap-server kafka:29092 --list
"

networks:
flowcept:
driver: bridge

volumes:
mongo_data:
5 changes: 5 additions & 0 deletions deployment/compose.yml
@@ -11,11 +11,16 @@ services:
image: mongo:latest
ports:
- 27017:27017
volumes:
- mongo_data:/data/db

networks:
flowcept:
driver: bridge

volumes:
mongo_data:

# This is just for the cases where one does not want to use the same Redis instance for caching and messaging, but
# it's not required to have separate instances.
# local_interceptor_cache: