
Add pae outlier detection (working with tabular data) #13
bdubayah committed Jun 21, 2021
1 parent 286d0f4 commit c69c1ea
Showing 7 changed files with 118 additions and 645 deletions.
5 changes: 5 additions & 0 deletions dora_exp_pipeline/dora_exp.py
@@ -21,6 +21,7 @@
from dora_exp_pipeline.random_outlier_detection import RandomOutlierDetection
from dora_exp_pipeline.negative_sampling_outlier_detection import \
NegativeSamplingOutlierDetection
from dora_exp_pipeline.pae_outlier_detection import PAEOutlierDetection
from dora_exp_pipeline.util import LogUtil
from dora_exp_pipeline.dora_feature import extract_feature
from dora_exp_pipeline.outlier_detection import get_alg_by_name
@@ -55,6 +56,10 @@ def register_od_algs():
negative_sampling_outlier_detection = NegativeSamplingOutlierDetection()
register_od_alg(negative_sampling_outlier_detection)

# Register PAE outlier detection algorithm in the pool
pae_outlier_detection = PAEOutlierDetection()
register_od_alg(pae_outlier_detection)


def start(config_file: str, log_file=None, seed=1234):

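For context, a minimal sketch of how the new registration is presumably consumed downstream. register_od_algs and get_alg_by_name are shown or imported above, but the lookup implementation is not part of this diff, so the call pattern below is an assumption:

from dora_exp_pipeline.dora_exp import register_od_algs
from dora_exp_pipeline.outlier_detection import get_alg_by_name

# Populate the algorithm pool (now including the 'pae' entry), then resolve
# the algorithm named in the config's outlier_detection section.
register_od_algs()
pae_alg = get_alg_by_name('pae')  # assumed to return the registered
                                  # PAEOutlierDetection instance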
5 changes: 4 additions & 1 deletion dora_exp_pipeline/example_config/dora_time_series.yml
@@ -21,7 +21,10 @@ outlier_detection: {
# negative_sampling: {
# percent_increase: 20
# },
random: {}
# random: {},
pae: {
latent_dim: 3
}
}

# Results organization module
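With pae enabled in the example config, the pipeline can be driven through the start() entry point shown in dora_exp.py above; a minimal sketch, assuming the example config path is used as-is:

from dora_exp_pipeline.dora_exp import start

# Run the pipeline with the updated example config; the 'pae' key in the
# outlier_detection section is dispatched to PAEOutlierDetection.
# The config path is illustrative -- point it at your own copy.
start('dora_exp_pipeline/example_config/dora_time_series.yml', seed=1234)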
106 changes: 106 additions & 0 deletions dora_exp_pipeline/pae_outlier_detection.py
@@ -0,0 +1,106 @@
from dora_exp_pipeline.outlier_detection import OutlierDetection
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow_probability import distributions, bijectors, layers as tfpl
from sklearn.model_selection import train_test_split


class PAEOutlierDetection(OutlierDetection):
def __init__(self):
super(PAEOutlierDetection, self).__init__('pae')

def _rank_internal(self, data_to_fit, data_to_score, seed,
latent_dim):
if latent_dim < 1:
raise RuntimeError('The dimensionality of the latent space must be '
'>= 1')

        # Check that the latent dimension <= number of features
        if latent_dim > data_to_fit.shape[1]:
            raise RuntimeError(f'The dimensionality of the latent space '
                               f'(latent_dim = {latent_dim}) '
                               f'must be <= number of features '
                               f'({data_to_fit.shape[1]})')

# Rank targets
return train_and_run_PAE(data_to_fit, data_to_score, latent_dim)


def train_and_run_PAE(train, test, latent_dim):
    # Probabilistic AutoEncoder (PAE): train an autoencoder, fit a normalizing
    # flow to the latent codes, and score samples by their negative
    # log-probability under the flow.
    # Train autoencoder
autoencoder = Autoencoder(latent_dim, train.shape[1])
autoencoder.compile(optimizer='adam', loss=losses.MeanSquaredError())
callback = EarlyStopping(monitor='val_loss', patience=3)
autoencoder.fit(
train,
train,
epochs=500,
callbacks=[callback],
validation_split=0.25,
shuffle=True
)

    # Fit a normalizing flow to the latent codes produced by the encoder
encoded_train = autoencoder.encoder(train).numpy()
flow = NormalizingFlow(latent_dim)
flow.compile(optimizer='adam', loss=lambda y, rv_y: -rv_y.log_prob(y))
callback = EarlyStopping(monitor='val_loss', patience=3)
    # The flow has no conditioning inputs, so it is fit on zero-width dummy
    # inputs with the encoded training data as the regression targets
    flow.fit(
        np.zeros((len(encoded_train), 0)),
encoded_train,
epochs=500,
callbacks=[callback],
validation_split=0.25,
shuffle=True
)

    # Score each test sample: negative log-probability of its latent code
    # under the trained flow (higher score = more novel)
trained_dist = flow.dist(np.zeros(0,))
encoded_test = autoencoder.encoder(test).numpy()
log_probs = trained_dist.log_prob(encoded_test).numpy()
novelty_scores = np.negative(log_probs)

return novelty_scores


class Autoencoder(Model):
    """Single hidden-layer autoencoder: a Dense encoder down to the latent
    space and a Dense decoder back to the input dimension."""

    def __init__(self, latent_dim, input_dim):
        super(Autoencoder, self).__init__()
self.encoder = keras.Sequential(
[
layers.Dense(latent_dim, activation='relu')
]
)
self.decoder = keras.Sequential(
[
layers.Dense(input_dim, activation='sigmoid')
]
)

def call(self, x):
encoded = self.encoder(x)
decoded = self.decoder(encoded)
return decoded


class NormalizingFlow(Model):
    """Masked autoregressive flow over the latent space; `dist` maps a
    zero-width input to a TFP distribution whose log_prob is used to score
    latent codes."""

    def __init__(self, latent_dim):
        super(NormalizingFlow, self).__init__()
self.dist = keras.Sequential(
[
layers.InputLayer(input_shape=(0,), dtype=tf.float32),
tfpl.DistributionLambda(lambda t:
distributions.MultivariateNormalDiag(
loc=tf.zeros(tf.concat(
[tf.shape(t)[:-1], [latent_dim]], axis=0)))),
tfpl.AutoregressiveTransform(bijectors.AutoregressiveNetwork(
params=2, hidden_units=[10, 10], activation='relu')),
]
)

def call(self, x):
return self.dist(x)
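As a quick sanity check, the new module can also be exercised directly on synthetic tabular data; a minimal sketch (the random arrays, dimensions, and injected outliers below are purely illustrative):

import numpy as np
from dora_exp_pipeline.pae_outlier_detection import train_and_run_PAE

rng = np.random.default_rng(1234)
fit_data = rng.normal(size=(200, 8)).astype(np.float32)   # nominal samples
score_data = np.vstack([
    rng.normal(size=(20, 8)),                              # more nominal samples
    rng.normal(loc=5.0, size=(5, 8)),                      # injected outliers
]).astype(np.float32)

# Higher scores (negative log-probability under the flow) mean more novel
scores = train_and_run_PAE(fit_data, score_data, latent_dim=3)
print(scores.shape)  # (25,)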
15 changes: 0 additions & 15 deletions pae/README.md

This file was deleted.

14 changes: 0 additions & 14 deletions pae/dora-pae-env.yml

This file was deleted.

614 changes: 0 additions & 614 deletions pae/pae.ipynb

This file was deleted.

4 changes: 3 additions & 1 deletion setup.py
@@ -18,7 +18,9 @@
'scikit-learn==0.24.2',
'Pillow==8.2.0',
'planetaryimage==0.5.0',
'PyYAML==5.4.1'
'PyYAML==5.4.1',
'tensorflow==2.5.0',
'tensorflow-probability==0.13.0rc0'
],
provide=[
'dora_exp_pipeline'
