Add DataDriftTrigger: supports one Evidently metric (#409)
This is a clean version of PR #367.

1. Add a DataDriftTrigger class to the supervisor. It supports one configurable Evidently metric and launches drift detection every N new data points. The data used for detection consists of the data trained on in the previous trigger plus all untriggered new data.
2. Update the Trigger interface: `Trigger.inform()` now returns a Generator instead of a List.
3. Add a generic ModelDownloader to the supervisor.
4. Add example pipelines using DataDriftTrigger.
5. Add Evidently to the pylint known third-party list.
6. Change ModelDownloader to embedding encoder utils: the downloader sets up and returns the model, and the DataDriftTrigger owns it.

Future work:
1. Support multiple configurable Evidently metrics. #416
2. Support Alibi-Detect. #414
3. Support custom embedding encoders. #417
4. Support different windowing for detection data, e.g. comparing with all previously trained data. #418
5. Common DataLoaderInfo. #415
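To make the first two points concrete, here is a minimal, hypothetical sketch of a generator-based inform() that runs an Evidently drift report once N new data points have accumulated. The class, helper names (_detect_drift, _embed), and the result-dict lookup are illustrative assumptions, not the actual Modyn implementation.

# Illustrative sketch only; not the Modyn DataDriftTrigger implementation.
# Assumes Evidently's Report / DataDriftPreset API and pandas DataFrames of embeddings.
from typing import Generator, Optional

import pandas as pd
from evidently.metric_preset import DataDriftPreset
from evidently.report import Report


class DataDriftTriggerSketch:
    def __init__(self, data_points_for_detection: int) -> None:
        self.detection_interval = data_points_for_detection
        self.unseen_keys: list[int] = []  # untriggered new data
        self.reference_df: Optional[pd.DataFrame] = None  # data trained on in the previous trigger

    def inform(self, new_keys: list[int]) -> Generator[int, None, None]:
        """Yield trigger indices lazily instead of returning a list."""
        for i, key in enumerate(new_keys):
            self.unseen_keys.append(key)
            if len(self.unseen_keys) >= self.detection_interval:
                # In this sketch: always trigger before a reference window exists,
                # afterwards only when drift is detected.
                if self.reference_df is None or self._detect_drift():
                    yield i
                    self.reference_df = self._embed(self.unseen_keys)
                    self.unseen_keys = []

    def _detect_drift(self) -> bool:
        current_df = self._embed(self.unseen_keys)
        report = Report(metrics=[DataDriftPreset()])  # the real trigger uses one configurable metric
        report.run(reference_data=self.reference_df, current_data=current_df)
        return report.as_dict()["metrics"][0]["result"]["dataset_drift"]

    def _embed(self, keys: list[int]) -> pd.DataFrame:
        # Placeholder: would use the downloaded model as an embedding encoder.
        raise NotImplementedError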
1 parent e176778 · commit 4a34044
Showing 25 changed files with 1,709 additions and 51 deletions.
benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml (71 additions, 0 deletions)
pipeline:
  name: ArXiv dataset Test Pipeline
  description: Example pipeline
  version: 1.0.0
model:
  id: ArticleNet
  config:
    num_classes: 172
model_storage:
  full_model_strategy:
    name: "PyTorchFullModel"
training:
  gpus: 1
  device: "cuda:0"
  dataloader_workers: 2
  use_previous_model: True
  initial_model: random
  batch_size: 96
  optimizers:
    - name: "default"
      algorithm: "SGD"
      source: "PyTorch"
      param_groups:
        - module: "model"
          config:
            lr: 0.00002
            momentum: 0.9
            weight_decay: 0.01
  optimization_criterion:
    name: "CrossEntropyLoss"
  checkpointing:
    activated: False
  selection_strategy:
    name: NewDataStrategy
    maximum_keys_in_memory: 10000
    config:
      storage_backend: "database"
      limit: -1
      reset_after_trigger: True
  seed: 42
  epochs_per_trigger: 1
data:
  dataset_id: arxiv_train
  bytes_parser_function: |
    def bytes_parser_function(data: bytes) -> str:
      return str(data, "utf8")
  tokenizer: DistilBertTokenizerTransform

trigger:
  id: DataDriftTrigger
  trigger_config:
    data_points_for_detection: 100000
    sample_size: 5000

evaluation:
  device: "cuda:0"
  result_writers: ["json"]
  datasets:
    - dataset_id: arxiv_test
      bytes_parser_function: |
        def bytes_parser_function(data: bytes) -> str:
          return str(data, "utf8")
      tokenizer: DistilBertTokenizerTransform
      batch_size: 96
      dataloader_workers: 2
      metrics:
        - name: "Accuracy"
          evaluation_transformer_function: |
            import torch
            def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor:
              return torch.argmax(model_output, dim=-1)
benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml (73 additions, 0 deletions)
pipeline:
  name: Huffpost dataset Test Pipeline
  description: Example pipeline
  version: 1.0.0
model:
  id: ArticleNet
  config:
    num_classes: 55
model_storage:
  full_model_strategy:
    name: "PyTorchFullModel"
training:
  gpus: 1
  device: "cuda:0"
  dataloader_workers: 2
  use_previous_model: True
  initial_model: random
  batch_size: 64
  optimizers:
    - name: "default"
      algorithm: "SGD"
      source: "PyTorch"
      param_groups:
        - module: "model"
          config:
            lr: 0.00002
            momentum: 0.9
            weight_decay: 0.01
  optimization_criterion:
    name: "CrossEntropyLoss"
  checkpointing:
    activated: False
  selection_strategy:
    name: NewDataStrategy
    maximum_keys_in_memory: 1000
    config:
      storage_backend: "database"
      limit: -1
      reset_after_trigger: True
  seed: 42
  epochs_per_trigger: 1
data:
  dataset_id: huffpost_train
  bytes_parser_function: |
    def bytes_parser_function(data: bytes) -> str:
      return str(data, "utf8")
  tokenizer: DistilBertTokenizerTransform

trigger:
  id: DataDriftTrigger
  trigger_config:
    data_points_for_detection: 5000
    metric_name: mmd
    metric_config:
      bootstrap: False

evaluation:
  device: "cuda:0"
  result_writers: ["json"]
  datasets:
    - dataset_id: huffpost_test
      bytes_parser_function: |
        def bytes_parser_function(data: bytes) -> str:
          return str(data, "utf8")
      tokenizer: DistilBertTokenizerTransform
      batch_size: 64
      dataloader_workers: 2
      metrics:
        - name: "Accuracy"
          evaluation_transformer_function: |
            import torch
            def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor:
              return torch.argmax(model_output, dim=-1)
benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml (76 additions, 0 deletions)
pipeline:
  name: Yearbook Test Pipeline
  description: Example pipeline
  version: 1.0.0
model:
  id: YearbookNet
  config:
    num_input_channels: 1
    num_classes: 2
model_storage:
  full_model_strategy:
    name: "PyTorchFullModel"
training:
  gpus: 1
  device: "cuda:0"
  dataloader_workers: 2
  use_previous_model: True
  initial_model: random
  batch_size: 64
  optimizers:
    - name: "default"
      algorithm: "SGD"
      source: "PyTorch"
      param_groups:
        - module: "model"
          config:
            lr: 0.001
            momentum: 0.9
  optimization_criterion:
    name: "CrossEntropyLoss"
  checkpointing:
    activated: False
  selection_strategy:
    name: NewDataStrategy
    maximum_keys_in_memory: 1000
    config:
      storage_backend: "database"
      limit: -1
      reset_after_trigger: True
  seed: 42
  epochs_per_trigger: 1
data:
  dataset_id: yearbook_train
  transformations: []
  bytes_parser_function: |
    import torch
    import numpy as np
    def bytes_parser_function(data: bytes) -> torch.Tensor:
      return torch.from_numpy(np.frombuffer(data, dtype=np.float32)).reshape(1, 32, 32)
trigger:
  id: DataDriftTrigger
  trigger_config:
    data_points_for_detection: 1000
    metric_name: model
    metric_config:
      threshold: 0.7

evaluation:
  device: "cuda:0"
  result_writers: ["json"]
  datasets:
    - dataset_id: yearbook_test
      bytes_parser_function: |
        import torch
        import numpy as np
        def bytes_parser_function(data: bytes) -> torch.Tensor:
          return torch.from_numpy(np.frombuffer(data, dtype=np.float32)).reshape(1, 32, 32)
      batch_size: 64
      dataloader_workers: 2
      metrics:
        - name: "Accuracy"
          evaluation_transformer_function: |
            import torch
            def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor:
              return torch.argmax(model_output, dim=-1)