initial isolation forest
davnn committed Mar 19, 2022
1 parent e72e541 commit a6cd8d5
Showing 6 changed files with 251 additions and 6 deletions.
7 changes: 6 additions & 1 deletion Project.toml
@@ -3,11 +3,16 @@ uuid = "6470b2ab-4fe8-498e-808d-6badd5c3da38"
authors = ["David Muhr <[email protected]> and contributors"]
version = "0.1.0"

[deps]
OutlierDetectionInterface = "1722ece6-f894-4ffc-b6be-6ca1174e2011"

[compat]
julia = "1"
OutlierDetectionInterface = "0.1"

[extras]
OutlierDetectionTest = "66620973-d34b-445b-a614-4040704cad69"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
test = ["Test", "OutlierDetectionTest"]
2 changes: 2 additions & 0 deletions README.md
@@ -1,4 +1,6 @@
# OutlierDetectionTrees

[![Documentation (stable)](https://img.shields.io/badge/docs-stable-blue.svg)](https://OutlierDetectionJL.github.io/OutlierDetection.jl/stable)
[![Documentation (dev)](https://img.shields.io/badge/docs-dev-blue.svg)](https://OutlierDetectionJL.github.io/OutlierDetection.jl/dev)
[![Build Status](https://github.com/OutlierDetectionJL/OutlierDetectionTrees.jl/actions/workflows/CI.yml/badge.svg?branch=master)](https://github.com/OutlierDetectionJL/OutlierDetectionTrees.jl/actions/workflows/CI.yml?query=branch%3Amaster)
[![Coverage](https://codecov.io/gh/OutlierDetectionJL/OutlierDetectionTrees.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/OutlierDetectionJL/OutlierDetectionTrees.jl)
176 changes: 176 additions & 0 deletions src/IsolationForest.jl
@@ -0,0 +1,176 @@
# adapted from LibIsolationForest
# see: https://github.com/msimms/LibIsolationForest/blob/master/julia/IsolationForest.jl
module IsolationForest

Feature = AbstractVector{T} where T <: AbstractFloat
Dataset = AbstractVector{<:Feature}
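# Illustrative example (not part of the algorithm): a Dataset with three features
# observed over four samples is a vector of three feature-value vectors, e.g.
# [[0.1, 0.4, 0.2, 0.9], [1.0, 1.2, 0.8, 5.0], [0.0, 0.3, 0.1, 0.2]].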

# Tree node, used internally.
mutable struct Node
featureName::Int
splitValue::AbstractFloat
left::Union{Node, Nothing}
right::Union{Node, Nothing}
end

# Isolation Forest.
mutable struct Forest
numTrees::Int
subSamplingSize::Int
numFeatures::Int
trees::AbstractArray{Union{Node, Nothing}}
function Forest(numTrees, subSamplingSize, numFeatures, featureValues)
forest = new(numTrees, subSamplingSize, numFeatures, [])
for i = 1:forest.numTrees
# deep-copy the original feature values for each tree, because create_tree
# mutates the dataset it is given while recursing
tree = create_tree(forest, deepcopy(featureValues), 0)
if tree !== nothing
push!(forest.trees, tree)
end
end
return forest
end
end

# Creates and returns a single tree. As this is a recursive function, depth indicates the current depth of the recursion.
function create_tree(forest::Forest, feature_values::Dataset, depth::Int)
# Sanity check
if forest.numFeatures <= 1
return nothing
end

# Stop if we have reached the maximum desired depth; in this implementation the
# sub-sampling size doubles as the tree depth limit.
if (forest.subSamplingSize > 0) && (depth >= forest.subSamplingSize)
return nothing
end

# Randomly select a feature.
randomly_selected_feature = rand(1:forest.numFeatures)

# Randomly select a split value.
feature_value_set = feature_values[randomly_selected_feature]
feature_value_set_len = length(feature_value_set)

if feature_value_set_len <= 1
return nothing
end
split_value_index = rand(1:feature_value_set_len)
split_value = feature_value_set[split_value_index]

# Create a tree node to hold the split value.
tree = Node(randomly_selected_feature, split_value, nothing, nothing)

# Split the feature value set we just used into a left and a right part. Note that
# this aliases (and mutates) the caller's feature set, which is why the Forest
# constructor deep-copies the features before growing each tree.
temp_feature_values = feature_values

# Create the left subtree.
left_features = feature_value_set[1:split_value_index]

temp_feature_values[randomly_selected_feature] = left_features
tree.left = IsolationForest.create_tree(forest, temp_feature_values, depth + 1)

# Create the right subtree.
if split_value_index + 1 < feature_value_set_len
right_features = feature_value_set[split_value_index + 1:feature_value_set_len]
temp_feature_values[randomly_selected_feature] = right_features
tree.right = IsolationForest.create_tree(forest, temp_feature_values, depth + 1)
end

return tree
end

# Scores the sample against the specified tree, returning the path length (depth)
# at which the sample is isolated.
function score_sample_against_tree(tree::Node, features::Feature)
depth = 0.0
current_node = tree

while current_node !== nothing
found_feature = false

# Find the next feature in the sample.
for (current_feature_name, current_feature_value) in enumerate(features)

# If the current node has the feature in question.
if current_feature_name == current_node.featureName
if current_feature_value < current_node.splitValue
current_node = current_node.left
else
current_node = current_node.right
end

depth = depth + 1.0
found_feature = true
break
end
end

# If the tree splits on a feature not present in the sample, then descend
# both subtrees and average the resulting path lengths.
if !found_feature
left_depth = depth + score_sample_against_tree(current_node.left, features)
right_depth = depth + score_sample_against_tree(current_node.right, features)
return (left_depth + right_depth) / 2.0
end
end

return depth
end

# Scores the sample against the entire forest of trees. Result is the average path length.
function score_sample_against_forest(forest::Forest, features::Feature)
num_trees = 0
avg_path_len = 0.0

for tree in forest.trees
path_len = score_sample_against_tree(tree, features)
if path_len > 0
avg_path_len = avg_path_len + path_len
num_trees = num_trees + 1
end
end

if num_trees > 0
avg_path_len = avg_path_len / num_trees
end

return avg_path_len
end

# Harmonic number approximation H(i) ≈ ln(i) + γ, with γ the Euler-Mascheroni constant.
function H(i)
return log(i) + 0.5772156649
end

# Average path length of an unsuccessful search in a binary search tree with n nodes,
# c(n) = 2H(n - 1) - 2(n - 1)/n, used to normalize path lengths (Liu et al., 2008).
function C(n)
return 2 * H(n - 1) - (2 * (n - 1) / n)
end

# Scores the sample against the entire forest of trees. The result is normalized as
# s = 2^(-E(h(x)) / c(n)), so values close to 1 indicate anomalies and values close
# to 0 indicate normal points.
function score_sample_against_forest_normalized(forest::Forest, features::Feature)

# Compute the average path length for all valid trees.
num_trees = 0
avg_path_len = 0.0

for tree in forest.trees
path_len = score_sample_against_tree(tree, features)
if path_len > 0
avg_path_len = avg_path_len + path_len
num_trees = num_trees + 1
end
end

if num_trees > 0
avg_path_len = avg_path_len / num_trees
end

# Normalize, per the original paper.
score = 0.0
if num_trees > 1.0
exponent = -1.0 * (avg_path_len / C(num_trees))
score = 2 ^ exponent
end

return score
end

end
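
A rough sketch of how this internal module could be exercised on its own (illustrative only, not part of the commit; the toy data and parameter values below are made up):

# 4 features observed over 64 samples, as a Dataset (vector of feature-value vectors)
features = [rand(64) for _ in 1:4]
# 10 trees, depth limit 8 (the sub-sampling size doubles as the depth limit), 4 features
forest = IsolationForest.Forest(10, 8, 4, features)
# score a single sample holding one value per feature
sample = rand(4)
path_len = IsolationForest.score_sample_against_forest(forest, sample)
anomaly_score = IsolationForest.score_sample_against_forest_normalized(forest, sample)
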
15 changes: 14 additions & 1 deletion src/OutlierDetectionTrees.jl
@@ -1,5 +1,18 @@
module OutlierDetectionTrees
using OutlierDetectionInterface
using OutlierDetectionInterface:SCORE_UNSUPERVISED
const OD = OutlierDetectionInterface

# Write your package code here.
include("models/IForest.jl")

const UUID = "6470b2ab-4fe8-498e-808d-6badd5c3da38"
const MODELS = [:IForestDetector]

for model in MODELS
@eval begin
OD.@default_frontend $model
OD.@default_metadata $model $UUID
export $model
end
end
end
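
Since MODELS contains a single entry, the @eval loop above unrolls to roughly the following (a sketch of the interpolated expressions only; the code generated by the OutlierDetectionInterface macros themselves is not shown):

OD.@default_frontend IForestDetector
OD.@default_metadata IForestDetector "6470b2ab-4fe8-498e-808d-6badd5c3da38"
export IForestDetector
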
46 changes: 46 additions & 0 deletions src/models/IForest.jl
@@ -0,0 +1,46 @@
include("../IsolationForest.jl")

"""
IForestDetector(num_trees = 100, sub_sampling_size = 256, normalize = false)
Determine the anomaly score of a sample based on its average path length over the trees of an isolation forest, see [1].
Parameters
----------
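num_trees::Integer
Number of isolation trees grown for the forest (default 100).
sub_sampling_size::Integer
Depth limit for each tree; in this implementation the sub-sampling size doubles as the maximum tree depth (default 256).
normalize::Bool
If true, scores are normalized to (0, 1) as in [1]; otherwise the raw average path length is returned (default false).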
$(SCORE_UNSUPERVISED("IForest"))
References
----------
[1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation Forest.” In 2008 Eighth IEEE International Conference on
Data Mining, 413–22. Pisa, Italy: IEEE, 2008.
"""
OD.@detector mutable struct IForestDetector <: UnsupervisedDetector
num_trees::Int = 100
sub_sampling_size::Int = 256
normalize::Bool = false
end

struct IForestModel <: DetectorModel
forest::IsolationForest.Forest
end

to_sample_set(X) = eachcol(X)
to_feature_set(X) = map(collect, eachrow(X))
make_scorer(forest, normalize::Bool) = normalize ?
x -> IsolationForest.score_sample_against_forest_normalized(forest, x) :
x -> IsolationForest.score_sample_against_forest(forest, x)

function OD.fit(detector::IForestDetector, X::Data; verbosity)::Fit
num_features = size(X, 1)
feature_values = to_feature_set(X)
sample_values = to_sample_set(X)
forest = IsolationForest.Forest(detector.num_trees, detector.sub_sampling_size, num_features, feature_values)
score = make_scorer(forest, detector.normalize)
return IForestModel(forest), score.(sample_values)
end

function OD.transform(detector::IForestDetector, model::IForestModel, X::Data)::Scores
sample_values = to_sample_set(X)
score = make_scorer(model.forest, detector.normalize)
return score.(sample_values)
end
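
A rough usage sketch of the raw entry points above (illustrative; assumes the keyword constructor generated by OD.@detector and a data matrix with features as rows and samples as columns, as implied by to_feature_set/to_sample_set; in practice detectors are usually driven through the OutlierDetection.jl / MLJ frontend):

using OutlierDetectionTrees
const OD = OutlierDetectionTrees.OD           # alias used only for this sketch

X_train = rand(5, 200)                        # 5 features, 200 samples
X_test = rand(5, 20)
detector = IForestDetector(num_trees = 50, sub_sampling_size = 64, normalize = true)
model, train_scores = OD.fit(detector, X_train; verbosity = 0)
test_scores = OD.transform(detector, model, X_test)
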
11 changes: 7 additions & 4 deletions test/runtests.jl
@@ -1,6 +1,9 @@
using OutlierDetectionTrees
using Test
using OutlierDetectionTest

@testset "OutlierDetectionTrees.jl" begin
# Write your tests here.
end
test_meta.(eval.(OutlierDetectionTrees.MODELS))

data = TestData()
run_test(detector) = test_detector(detector, data)

run_test(IForestDetector())
