Changes to ml decorator tests
renan-souza committed Dec 17, 2024
1 parent 1f7db22 commit ed2794e
Showing 16 changed files with 102 additions and 664 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/create-release-n-publish.yml
@@ -121,7 +121,7 @@ jobs:
       run: make services
 
     - name: Test with pytest
-      run: pytest --ignore=tests/decorator_tests/ml_tests/llm_tests
+      run: pytest
 
     - name: Test notebooks
       run: |
2 changes: 1 addition & 1 deletion .github/workflows/run-tests-kafka.yml
@@ -58,7 +58,7 @@ jobs:
         make tests
     - name: Test notebooks
-      run: pytest --ignore=notebooks/zambeze.ipynb --nbmake "notebooks/" --nbmake-timeout=600 --ignore=notebooks/dask_from_CLI.ipynb
+      run: pytest --nbmake "notebooks/" --nbmake-timeout=600 --ignore=notebooks/dask_from_CLI.ipynb
 
     - name: Stop services
       run: docker compose -f deployment/compose-kafka.yml down
2 changes: 1 addition & 1 deletion .github/workflows/run-tests-py11.yml
@@ -78,7 +78,7 @@ jobs:
           export MQ_TYPE=kafka
           export MQ_PORT=9092
           # Ignoring heavy tests. They are executed with Kafka in another GH Action.
-          pytest --ignore=tests/decorator_tests/ml_tests --ignore=tests/adapters/test_tensorboard.py
+          pytest
     - name: Stop services
       run: docker compose -f deployment/compose-kafka.yml down
24 changes: 23 additions & 1 deletion .github/workflows/run_examples.sh
@@ -4,6 +4,24 @@
 set -e
 set -o pipefail
 
+# Display usage/help message
+usage() {
+  echo -e "\nUsage: $0 <examples_dir> <with_mongo>\n"
+  echo "Arguments:"
+  echo "  examples_dir   Path to the examples directory (Mandatory)"
+  echo "  with_mongo     Boolean flag (true/false) indicating whether to include MongoDB support (Mandatory)"
+  echo -e "\nExample:"
+  echo "  $0 examples true"
+  echo "  $0 examples false"
+  exit 1
+}
+
+# Check if the required arguments are provided
+if [[ -z "$1" || -z "$2" ]]; then
+  echo "Error: Missing mandatory arguments!"
+  usage
+fi
+
 # Function to run tests with common steps
 run_test() {
     test_path="${EXAMPLES_DIR}/${1}_example.py"
@@ -29,6 +29,10 @@ run_test() {
     elif [[ "$test_type" =~ "tensorboard" ]]; then
         echo "Installing tensorboard"
         pip install .[tensorboard] > /dev/null 2>&1
+    elif [[ "$test_type" =~ "llm_complex" ]]; then
+        echo "Defining python path for llm_complex..."
+        export PYTHONPATH=$PYTHONPATH:${EXAMPLES_DIR}/llm_complex
+        echo $PYTHONPATH
     fi

# Run the test and capture output
@@ -51,7 +73,7 @@ echo "Using examples directory: $EXAMPLES_DIR"
echo "With Mongo? ${WITH_MONGO}"

# Define the test cases
tests=("instrumented_simple" "instrumented_loop" "dask" "mlflow" "tensorboard")
tests=("instrumented_simple" "instrumented_loop" "dask" "mlflow" "tensorboard" "llm_complex/llm_search")

# Iterate over the tests and run them
for test_ in "${tests[@]}"; do
3 changes: 2 additions & 1 deletion .gitignore
@@ -20,4 +20,5 @@ test.py
 time.txt
 tmp/
 deployment/data
-examples/llm_complex/output_data
+**/*output_data*
+
10 changes: 6 additions & 4 deletions Makefile
@@ -11,6 +11,7 @@ help:
 	@printf "\033[32mtests\033[0m run unit tests with pytest\n"
 	@printf "\033[32mtests-in-container\033[0m run unit tests with pytest inside Flowcept's container\n"
 	@printf "\033[32mtests-in-container-mongo\033[0m run unit tests inside container with MongoDB\n"
+	@printf "\033[32mtests-in-container-kafka\033[0m run unit tests inside container with Kafka and MongoDB\n"
 	@printf "\033[32mtests-all\033[0m run all unit tests with pytest, including long-running ones\n"
 	@printf "\033[32mtests-notebooks\033[0m test the notebooks using pytest\n"
 	@printf "\033[32mclean\033[0m remove cache directories and Sphinx build output\n"
@@ -43,6 +44,7 @@ clean:
find . -type d -name "mlruns" -exec rm -rf {} \; 2>/dev/null || true
find . -type d -name "__pycache__" -exec rm -rf {} \; 2>/dev/null || true
find . -type d -name "*tfevents*" -exec rm -rf {} \; 2>/dev/null || true
find . -type d -name "*output_data*" -exec rm -rf {} \; 2>/dev/null || true
# sphinx-build -M clean docs docs/_build This needs to be fixed.

# Build the HTML documentation using Sphinx
@@ -73,13 +75,13 @@ run:
 	docker run --rm -v $(shell pwd):/flowcept -e KVDB_HOST=flowcept_redis -e MQ_HOST=flowcept_redis -e MONGO_HOST=flowcept_mongo --network flowcept_default -it flowcept
 
 tests-in-container-mongo:
-	docker run --rm -v $(shell pwd):/flowcept -e KVDB_HOST=flowcept_redis -e MQ_HOST=flowcept_redis -e MONGO_HOST=flowcept_mongo -e MONGO_ENABLED=true -e LMDB_ENABLED=false --network flowcept_default flowcept /opt/conda/envs/flowcept/bin/pytest --ignore=tests/decorator_tests/ml_tests
+	docker run --rm -v $(shell pwd):/flowcept -e KVDB_HOST=flowcept_redis -e MQ_HOST=flowcept_redis -e MONGO_HOST=flowcept_mongo -e MONGO_ENABLED=true -e LMDB_ENABLED=false --network flowcept_default flowcept /opt/conda/envs/flowcept/bin/pytest
 
 tests-in-container:
-	docker run --rm -v $(shell pwd):/flowcept -e KVDB_HOST=flowcept_redis -e MQ_HOST=flowcept_redis -e MONGO_ENABLED=false -e LMDB_ENABLED=true --network flowcept_default flowcept /opt/conda/envs/flowcept/bin/pytest --ignore=tests/decorator_tests/ml_tests
+	docker run --rm -v $(shell pwd):/flowcept -e KVDB_HOST=flowcept_redis -e MQ_HOST=flowcept_redis -e MONGO_ENABLED=false -e LMDB_ENABLED=true --network flowcept_default flowcept /opt/conda/envs/flowcept/bin/pytest
 
 tests-in-container-kafka:
-	docker run --rm -v $(shell pwd):/flowcept -e KVDB_HOST=flowcept_redis -e MQ_HOST=kafka -e MONGO_HOST=flowcept_mongo -e MQ_PORT=29092 -e MQ_TYPE=kafka -e MONGO_ENABLED=true -e LMDB_ENABLED=false --network flowcept_default flowcept /opt/conda/envs/flowcept/bin/pytest --ignore=tests/decorator_tests/ml_tests
+	docker run --rm -v $(shell pwd):/flowcept -e KVDB_HOST=flowcept_redis -e MQ_HOST=kafka -e MONGO_HOST=flowcept_mongo -e MQ_PORT=29092 -e MQ_TYPE=kafka -e MONGO_ENABLED=true -e LMDB_ENABLED=false --network flowcept_default flowcept /opt/conda/envs/flowcept/bin/pytest
 
 # This command can be removed once we have our CLI
 liveness:
@@ -88,7 +90,7 @@ liveness:
 # Run unit tests using pytest
 .PHONY: tests
 tests:
-	pytest --ignore=tests/decorator_tests/ml_tests/llm_tests
+	pytest
 
 .PHONY: tests-notebooks
 tests-notebooks:
File renamed without changes.
2 changes: 1 addition & 1 deletion src/flowcept/instrumentation/decorators/flowcept_torch.py
@@ -219,7 +219,7 @@ def _register_as_workflow(self):
         workflow_obj.name = cls.__name__
         workflow_obj.campaign_id = self._campaign_id
         workflow_obj.parent_workflow_id = self._parent_workflow_id
-        _custom_metadata = self._custom_metadata
+        _custom_metadata = self._custom_metadata or {}
         _custom_metadata["workflow_type"] = "TorchModule"
 
         if self._should_get_profile:
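Note on the flowcept_torch.py change: the new "or {}" guard matters because _custom_metadata can be None when the caller passes no custom metadata, and the very next line assigns into it. A minimal standalone sketch (illustrative values, not code from the repo) of the failure the guard prevents:

    # Pre-fix: item assignment on None raises TypeError.
    custom_metadata = None  # what self._custom_metadata holds when nothing was passed
    try:
        custom_metadata["workflow_type"] = "TorchModule"
    except TypeError as err:
        print(f"pre-fix behavior: {err}")  # 'NoneType' object does not support item assignment

    # Post-fix: fall back to an empty dict, then assign safely.
    custom_metadata = custom_metadata or {}
    custom_metadata["workflow_type"] = "TorchModule"
    print(custom_metadata)  # {'workflow_type': 'TorchModule'}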
7 changes: 7 additions & 0 deletions tests/adapters/test_tensorboard.py
@@ -52,6 +52,13 @@ def run_tensorboard_hparam_tuning(self):
         (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
         x_train, x_test = x_train / 255.0, x_test / 255.0
 
+        # Reduce the dataset size for faster debugging
+        DEBUG_SAMPLES_TRAIN = 100  # Number of training samples to keep
+        DEBUG_SAMPLES_TEST = 20  # Number of test samples to keep
+
+        x_train, y_train = x_train[:DEBUG_SAMPLES_TRAIN], y_train[:DEBUG_SAMPLES_TRAIN]
+        x_test, y_test = x_test[:DEBUG_SAMPLES_TEST], y_test[:DEBUG_SAMPLES_TEST]
+
         HP_NUM_UNITS = hp.HParam("num_units", hp.Discrete([16]))
         HP_DROPOUT = hp.HParam("dropout", hp.RealInterval(0.1, 0.2))
         HP_OPTIMIZER = hp.HParam("optimizer", hp.Discrete(["adam", "sgd"]))
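Note on the test_tensorboard.py change: the added lines only truncate the Fashion-MNIST arrays so the hyperparameter-tuning test runs quickly. The same slicing, shown on stand-in arrays (shapes match Fashion-MNIST; the data here is fake):

    import numpy as np

    x_train, y_train = np.zeros((60000, 28, 28)), np.zeros(60000)  # stand-ins for the real arrays
    DEBUG_SAMPLES_TRAIN = 100

    # Slicing keeps only the first N samples; the image dimensions are untouched.
    x_train, y_train = x_train[:DEBUG_SAMPLES_TRAIN], y_train[:DEBUG_SAMPLES_TRAIN]
    print(x_train.shape, y_train.shape)  # (100, 28, 28) (100,)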
105 changes: 41 additions & 64 deletions tests/decorator_tests/ml_tests/dl_trainer.py
@@ -4,26 +4,20 @@
 from torch.utils.data import Subset, DataLoader
 from torchvision import datasets, transforms
 from torch import nn, optim
-from torch.nn import functional as F
+from torch.nn import functional as F, Conv2d, Dropout, MaxPool2d, ReLU, Linear, Softmax
 
 from flowcept import (
     Flowcept,
 )
-from flowcept.instrumentation.decorators.flowcept_torch import (
-    register_modules,
-    register_module_as_workflow,
-    torch_task,
-)
-from flowcept.instrumentation.decorators.responsible_ai import (
-    model_profiler,
-)
 
 import threading
 
+from flowcept.instrumentation.decorators.flowcept_torch import flowcept_torch
 
 thread_state = threading.local()
 
 
+@flowcept_torch
 class MyNet(nn.Module):
     def __init__(
         self,
@@ -35,22 +29,9 @@ def __init__(
         parent_workflow_id=None,
         parent_task_id=None,
     ):
-        super(MyNet, self).__init__()
+        super().__init__()
+        print("parent workflow id", parent_workflow_id)
-        self.workflow_id = register_module_as_workflow(self, parent_workflow_id=parent_workflow_id)
-        self.parent_task_id = parent_task_id
-        Conv2d, Dropout, MaxPool2d, ReLU, Softmax, Linear = register_modules(
-            [
-                nn.Conv2d,
-                nn.Dropout,
-                nn.MaxPool2d,
-                nn.ReLU,
-                nn.Softmax,
-                nn.Linear,
-            ],
-            workflow_id=self.workflow_id,
-            parent_task_id=self.parent_task_id,
-        )
 
         self.model_type = "CNN"
         # TODO: add if len conv_in_outs > 0
@@ -79,7 +60,6 @@ def __init__(
             self.fc_layers.append(Softmax(dim=softmax_dims[i]))
         self.view_size = fc_in_outs[0][0]
 
-    @torch_task()
     def forward(self, x):
         x = self.conv_layers(x)
         x = x.view(-1, self.view_size)
@@ -89,7 +69,7 @@ def forward(self, x):

 class ModelTrainer(object):
     @staticmethod
-    def build_train_test_loader(batch_size=128, random_seed=0, debug=True, subset_size=1000):
+    def build_train_test_loader(batch_size=128, random_seed=0, debug=True, subset_size=10):
         torch.manual_seed(random_seed)
 
         # Load the full MNIST dataset
@@ -157,7 +137,6 @@ def _test(model, device, test_loader):

     # @model_explainer()
     @staticmethod
-    @model_profiler()
     def model_fit(
         conv_in_outs=[[1, 10], [10, 20]],
         conv_kernel_sizes=[5, 5],
@@ -181,43 +160,41 @@ def model_fit(
         # We are calling the consumer api here (sometimes for the second time)
         # because we are capturing at two levels: at the model.fit and at
         # every layer. Can we do it better?
-        with Flowcept(
-            bundle_exec_id=workflow_id,
-            start_persistence=False,
-        ):
-            train_loader, test_loader = ModelTrainer.build_train_test_loader()
-            if torch.backends.mps.is_available():
-                device = torch.device("mps")
-            else:
-                device = torch.device("cpu")
-            model = MyNet(
-                conv_in_outs=conv_in_outs,
-                conv_kernel_sizes=conv_kernel_sizes,
-                conv_pool_sizes=conv_pool_sizes,
-                fc_in_outs=fc_in_outs,
-                softmax_dims=softmax_dims,
-                parent_workflow_id=workflow_id,
-            )
-            model = model.to(device)
-            optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
-            test_info = {}
-            print("Starting training....")
-            for epoch in range(1, max_epochs + 1):
-                ModelTrainer._train(model, device, train_loader, optimizer, epoch)
-                test_info = ModelTrainer._test(model, device, test_loader)
-            print("Finished training....")
-            batch = next(iter(test_loader))
-            test_data, _ = batch
-            result = test_info.copy()
-            result.update(
-                {
-                    "model": model,
-                    "test_data": test_data,
-                    "task_id": task_id,
-                    "random_seed": random_seed,
-                }
-            )
-            return result
+        train_loader, test_loader = ModelTrainer.build_train_test_loader()
+        if torch.backends.mps.is_available():
+            device = torch.device("mps")
+        else:
+            device = torch.device("cpu")
+        model = MyNet(
+            conv_in_outs=conv_in_outs,
+            conv_kernel_sizes=conv_kernel_sizes,
+            conv_pool_sizes=conv_pool_sizes,
+            fc_in_outs=fc_in_outs,
+            softmax_dims=softmax_dims,
+            parent_workflow_id=workflow_id,
+        )
+        model = model.to(device)
+        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
+        test_info = {}
+        print("Starting training....")
+        for epoch in range(1, max_epochs + 1):
+            ModelTrainer._train(model, device, train_loader, optimizer, epoch)
+            test_info = ModelTrainer._test(model, device, test_loader)
+        print("Finished training....")
+        batch = next(iter(test_loader))
+        test_data, _ = batch
+        result = test_info.copy()
+
+        best_obj_id = Flowcept.db.save_torch_model(model, task_id=task_id, workflow_id=workflow_id, custom_metadata=result)
+        result.update(
+            {
+                "best_obj_id": best_obj_id,
+                "test_data": test_data,
+                "task_id": task_id,
+                "random_seed": random_seed,
+            }
+        )
+        return result
 
     @staticmethod
     def generate_hp_confs(hp_conf: dict):
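Note on the dl_trainer.py refactor: the old per-layer plumbing (register_module_as_workflow, register_modules, @torch_task, @model_profiler) is replaced by a single @flowcept_torch class decorator, and the trained model is now persisted with Flowcept.db.save_torch_model. A minimal sketch of the new style, assuming only what this diff shows: the decorator wraps plain nn.Module subclasses and the constructor may accept a parent_workflow_id keyword. TinyNet is hypothetical, and instrumentation may additionally require a running Flowcept session:

    import torch
    from torch import nn
    from flowcept.instrumentation.decorators.flowcept_torch import flowcept_torch

    @flowcept_torch
    class TinyNet(nn.Module):  # hypothetical module, not part of this commit
        def __init__(self, parent_workflow_id=None):
            super().__init__()
            self.fc = nn.Linear(4, 2)

        def forward(self, x):
            return self.fc(x)

    model = TinyNet()
    out = model(torch.randn(1, 4))  # forward passes are what the decorator captures
    print(out.shape)  # torch.Size([1, 2])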
Empty file.
