-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
251 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,11 +3,16 @@ uuid = "6470b2ab-4fe8-498e-808d-6badd5c3da38" | |
authors = ["David Muhr <[email protected]> and contributors"] | ||
version = "0.1.0" | ||
|
||
[deps] | ||
OutlierDetectionInterface = "1722ece6-f894-4ffc-b6be-6ca1174e2011" | ||
|
||
[compat] | ||
julia = "1" | ||
OutlierDetectionInterface = "0.1" | ||
|
||
[extras] | ||
OutlierDetectionTest = "66620973-d34b-445b-a614-4040704cad69" | ||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" | ||
|
||
[targets] | ||
test = ["Test"] | ||
test = ["Test", "OutlierDetectionTest"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,6 @@ | ||
# OutlierDetectionTrees | ||
|
||
[![Documentation (stable)](https://img.shields.io/badge/docs-stable-blue.svg)](https://OutlierDetectionJL.github.io/OutlierDetection.jl/stable) | ||
[![Documentation (dev)](https://img.shields.io/badge/docs-dev-blue.svg)](https://OutlierDetectionJL.github.io/OutlierDetection.jl/dev) | ||
[![Build Status](https://github.com/OutlierDetectionJL/OutlierDetectionTrees.jl/actions/workflows/CI.yml/badge.svg?branch=master)](https://github.com/OutlierDetectionJL/OutlierDetectionTrees.jl/actions/workflows/CI.yml?query=branch%3Amaster) | ||
[![Coverage](https://codecov.io/gh/OutlierDetectionJL/OutlierDetectionTrees.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/OutlierDetectionJL/OutlierDetectionTrees.jl) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
# adapted from LibIsolationForest | ||
# see: https://github.com/msimms/LibIsolationForest/blob/master/julia/IsolationForest.jl | ||
module IsolationForest

# A single sample, or all observed values of one feature: a vector of floats.
const Feature = AbstractVector{<:AbstractFloat}
# One value vector per feature, indexed by feature number.
const Dataset = AbstractVector{<:Feature}

# Tree node, used internally.
#   featureName - 1-based position (within a sample) of the feature this node splits on
#   splitValue  - samples whose feature value is below this go left, all others go right
#   left, right - child nodes, or `nothing` where the tree ends
mutable struct Node
    featureName::Int
    splitValue::AbstractFloat
    left::Union{Node, Nothing}
    right::Union{Node, Nothing}
end

# Isolation Forest.
#   numTrees        - number of trees to attempt to build
#   subSamplingSize - when positive, the recursion depth at which tree growth stops
#   numFeatures     - number of features a split may be drawn from
#   trees           - root nodes of the trees that could be built
mutable struct Forest
    numTrees::Int
    subSamplingSize::Int
    numFeatures::Int
    trees::AbstractArray{Union{Node, Nothing}}

    # Builds up to `numTrees` trees from `featureValues` (one value vector per feature).
    # `featureValues` itself is left untouched.
    function Forest(numTrees, subSamplingSize, numFeatures, featureValues)
        forest = new(numTrees, subSamplingSize, numFeatures, Union{Node, Nothing}[])
        for _ in 1:forest.numTrees
            # `create_tree` overwrites entries of the collection it is given, so every
            # tree gets its own copy of the *original* data. A shallow copy suffices:
            # only the outer entries are replaced, the value vectors are never mutated.
            tree = create_tree(forest, copy(featureValues), 0)
            if tree !== nothing
                push!(forest.trees, tree)
            end
        end
        return forest
    end
end

# Creates and returns a single tree, or `nothing` if no split can be made.
# As this is a recursive function, depth indicates the current depth of the recursion.
#
# NOTE: `feature_values` is modified in place and the same collection is shared by the
# whole recursion, so a subtree sees whatever its previously built siblings left behind.
# Callers that need their data preserved must pass a copy.
# NOTE(review): giving each branch its own copy would change (and presumably greatly
# enlarge) the generated trees - confirm before altering this.
function create_tree(forest::Forest, feature_values::Dataset, depth::Int)
    # Sanity check
    if forest.numFeatures <= 1
        return nothing
    end

    # If we've exceeded the maximum desired depth, then stop.
    if (forest.subSamplingSize > 0) && (depth >= forest.subSamplingSize)
        return nothing
    end

    # Randomly select a feature.
    randomly_selected_feature = rand(1:forest.numFeatures)

    # Randomly select a split value.
    feature_value_set = feature_values[randomly_selected_feature]
    feature_value_set_len = length(feature_value_set)
    if feature_value_set_len <= 1
        return nothing
    end
    split_value_index = rand(1:feature_value_set_len)
    split_value = feature_value_set[split_value_index]

    # Create a tree node to hold the split value.
    tree = Node(randomly_selected_feature, split_value, nothing, nothing)

    # Create the left subtree from the values up to and including the split position.
    feature_values[randomly_selected_feature] = feature_value_set[1:split_value_index]
    tree.left = create_tree(forest, feature_values, depth + 1)

    # Create the right subtree from the values after the split position; it is only
    # built when at least two values remain there.
    if split_value_index + 1 < feature_value_set_len
        feature_values[randomly_selected_feature] =
            feature_value_set[(split_value_index + 1):feature_value_set_len]
        tree.right = create_tree(forest, feature_values, depth + 1)
    end

    return tree
end

# An absent subtree contributes no path length.
score_sample_against_tree(::Nothing, features::Feature) = 0.0

# Scores the sample against the specified tree. Returns the number of nodes visited while
# routing the sample down the tree, as a Float64.
function score_sample_against_tree(tree::Node, features::Feature)
    depth = 0.0
    current_node = tree

    while current_node !== nothing
        feature_name = current_node.featureName

        if 1 <= feature_name <= length(features)
            # The sample has the feature this node splits on: follow the matching side.
            feature_value = features[firstindex(features) + feature_name - 1]
            if feature_value < current_node.splitValue
                current_node = current_node.left
            else
                current_node = current_node.right
            end
            depth = depth + 1.0
        else
            # The tree contains a feature that is not in the sample: take both sides of
            # the tree and average their scores together.
            left_depth = depth + score_sample_against_tree(current_node.left, features)
            right_depth = depth + score_sample_against_tree(current_node.right, features)
            return (left_depth + right_depth) / 2.0
        end
    end

    return depth
end

# Averages the path length of `features` over all trees that yield a positive path length.
# Returns `(average, count)`; the average is 0.0 when no tree contributed.
function average_path_length(forest::Forest, features::Feature)
    num_trees = 0
    total_path_len = 0.0

    for tree in forest.trees
        path_len = score_sample_against_tree(tree, features)
        if path_len > 0
            total_path_len = total_path_len + path_len
            num_trees = num_trees + 1
        end
    end

    avg_path_len = num_trees > 0 ? total_path_len / num_trees : 0.0
    return avg_path_len, num_trees
end

# Scores the sample against the entire forest of trees. Result is the average path length.
function score_sample_against_forest(forest::Forest, features::Feature)
    avg_path_len, _ = average_path_length(forest, features)
    return avg_path_len
end

# Approximation of the i-th harmonic number: ln(i) plus the Euler-Mascheroni constant.
function H(i)
    return log(i) + 0.5772156649
end

# Normalization term c(n) = 2 H(n - 1) - 2 (n - 1) / n used by the normalized score.
function C(n)
    return 2 * H(n - 1) - (2 * (n - 1) / n)
end

# Scores the sample against the entire forest of trees. Result is normalized so that values
# close to 1 indicate anomalies and values close to zero indicate normal values.
# Returns 0.0 when fewer than two trees contribute a path length.
function score_sample_against_forest_normalized(forest::Forest, features::Feature)
    # Compute the average path length for all valid trees.
    avg_path_len, num_trees = average_path_length(forest, features)

    # Normalize, per the original paper.
    # NOTE(review): c(.) is evaluated at the number of contributing trees; the paper
    # appears to define it over the sub-sample size - confirm which one is intended.
    score = 0.0
    if num_trees > 1
        exponent = -1.0 * (avg_path_len / C(num_trees))
        score = 2 ^ exponent
    end

    return score
end

end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,18 @@ | ||
module OutlierDetectionTrees

using OutlierDetectionInterface
using OutlierDetectionInterface: SCORE_UNSUPERVISED
const OD = OutlierDetectionInterface

# Detector implementations.
include("models/IForest.jl")

# Package UUID handed to the metadata macro below.
const UUID = "6470b2ab-4fe8-498e-808d-6badd5c3da38"
# Names of all detectors defined by this package.
const MODELS = [:IForestDetector]

# For every detector: apply the interface's default frontend and metadata macros and
# export the detector name.
for detector in MODELS
    @eval begin
        OD.@default_frontend $detector
        OD.@default_metadata $detector $UUID
        export $detector
    end
end

end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
include("../IsolationForest.jl")

"""
    IForestDetector()

Determine the anomaly score of a sample based on its average path length over the trees of an
isolation forest, see [1].

Parameters
----------
    num_trees::Int
Number of trees the forest attempts to build.

    sub_sampling_size::Int
Passed to the forest as its sub-sampling size; tree construction stops once the recursion depth
reaches this value.

    normalize::Bool
If `true`, scores come from the normalized forest scorer instead of the plain average path
length.

$(SCORE_UNSUPERVISED("IForest"))

References
----------
[1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation Forest.” In 2008 Eighth IEEE
International Conference on Data Mining, 413–22. Pisa, Italy: IEEE, 2008.
"""
OD.@detector mutable struct IForestDetector <: UnsupervisedDetector
    num_trees::Int = 100
    sub_sampling_size::Int = 256
    normalize::Bool = false
end
|
||
# Learned model of an `IForestDetector`: wraps the fitted isolation forest.
struct IForestModel <: DetectorModel
    forest::IsolationForest.Forest
end
|
||
# Iterate over the columns of `X`; each column is treated as one sample.
function to_sample_set(X)
    return eachcol(X)
end

# Materialize every row of `X` as its own vector; each row holds the values of one feature.
function to_feature_set(X)
    return [collect(row) for row in eachrow(X)]
end

# Build a one-argument scoring function bound to `forest`; `normalize` selects between the
# normalized score and the plain average path length.
function make_scorer(forest, normalize::Bool)
    if normalize
        return x -> IsolationForest.score_sample_against_forest_normalized(forest, x)
    end
    return x -> IsolationForest.score_sample_against_forest(forest, x)
end
|
||
# Fit an isolation forest to `X` and score the training data with it.
# Rows of `X` are used as features and columns as samples. Returns the learned model
# together with one score per column of `X`. `verbosity` is accepted but not used here.
function OD.fit(detector::IForestDetector, X::Data; verbosity)::Fit
    forest = IsolationForest.Forest(
        detector.num_trees,
        detector.sub_sampling_size,
        size(X, 1),
        to_feature_set(X),
    )
    scorer = make_scorer(forest, detector.normalize)
    samples = to_sample_set(X)
    return IForestModel(forest), scorer.(samples)
end
|
||
# Score every column of `X` against the forest stored in `model`, using the same scorer
# variant (normalized or not) that the detector is configured with.
function OD.transform(detector::IForestDetector, model::IForestModel, X::Data)::Scores
    scorer = make_scorer(model.forest, detector.normalize)
    return scorer.(to_sample_set(X))
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,9 @@ | ||
using OutlierDetectionTrees | ||
using Test | ||
using OutlierDetectionTest | ||
|
||
@testset "OutlierDetectionTrees.jl" begin
    # Write your tests here.
end

# Run the metadata check from OutlierDetectionTest on every model the package lists.
foreach(OutlierDetectionTrees.MODELS) do model
    test_meta(eval(model))
end

# Shared data set for the detector tests.
data = TestData()

# Run the generic detector test for `detector` against the shared data.
function run_test(detector)
    return test_detector(detector, data)
end

run_test(IForestDetector())