# Attach every package used in this script. Warnings and startup
# messages are silenced during loading only, to keep the console quiet.
suppressWarnings(
  suppressPackageStartupMessages({
    library(careless)
    library(tidyverse)
    library(isotree)
    library(manydist)
    library(stray)
    library(RobStatTM)
    library(dbscan)
    library(e1071)
    library(randomForest)
    library(outForest)
    library(odetector)
  })
)
options(scipen = 9999, digits = 3)Outlier Detection Methods
OUTLIER DETECTION: a simple taxonomy
“Don’t talk unless you can improve the silence.”
J.L. Borges
Outlier or anomaly detection algorithms are critical for fraud detection. They represent a first line of defense to identify transactions, behaviors, or data points that deviate significantly from established normal patterns.
Outlier detection algorithms enable real-time flagging of anomalies—such as unusual credit card spending or unauthorized network access—that traditional, rule-based systems often miss.
# Simulate two Gaussian clusters and append three hand-placed points
# that sit between / away from the clusters (candidate outliers).
set.seed(42)
group1 <- data.frame(x = rnorm(1000, -1, .4),
                     y = rnorm(1000, -1, .2))
group2 <- data.frame(x = rnorm(1000, +1, .2),
                     y = rnorm(1000, +1, .4))
X <- rbind(group1, group2)
X <- rbind(X, c(-1, 1), c(-0.5, 0.5), c(0, 0))
plot(X)

# Distance-Based Methods ----
## Euclidean Distance ----
# Flag the 5% of points farthest (in Euclidean distance) from the
# column-wise mean of the data.
dat.center <- colMeans(X[, 1:2])

euclidean_dist_to_center <- function(point, center) {
  # Euclidean distance: sqrt(sum((x1 - c1)^2 + (x2 - c2)^2 + ...))
  sqrt(sum((point - center)^2))
}

# Restrict apply() to the coordinate columns so columns added to X
# later cannot silently change the distance computation.
X$myeuclid <- apply(X[, 1:2], 1, euclidean_dist_to_center, center = dat.center)
threshold_quant <- quantile(X$myeuclid, 0.95)
X$isout <- X$myeuclid > threshold_quant
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

## Manhattan Distance ----
manhattan_dist_to_center <- function(point, center) {
  # Manhattan (L1) distance: sum(|x1 - c1| + |x2 - c2| + ...)
  # (Original comment described the Euclidean formula — copy/paste error.)
  sum(abs(point - center))
}

# BUG FIX: X now also holds myeuclid and isout, so apply(X, 1, ...)
# would feed 4-element rows against a 2-element center and silently
# recycle. Use only the coordinate columns.
X$myman <- apply(X[, 1:2], 1, manhattan_dist_to_center, center = dat.center)
threshold_quant <- quantile(X$myman, 0.95)
X$isout <- X$myman > threshold_quant
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# Mahalanobis Distance ----
# Mahalanobis distance accounts for the covariance structure of the
# data by scaling distances with the inverse covariance matrix.
dat.center <- colMeans(X[, 1:2])
dat.cov <- cov(X[, 1:2])
X$mdist <- mahalanobis(
  x = X[, 1:2],
  center = dat.center,
  cov = dat.cov
)

# Theoretical cutoff: for multivariate normal data, squared Mahalanobis
# distances follow a chi-square distribution with df = number of
# variables. Kept for reference; the empirical quantile below is the
# threshold actually applied.
threshold_chi <- qchisq(p = 0.95, df = ncol(X[, 1:2]))
threshold_quant <- quantile(X$mdist, 0.99)
X$isout <- X$mdist > threshold_quant
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# Highlight the flagged points. The data center is drawn with
# annotate() rather than geom_point(aes(...)): a length-1 aesthetic
# inside aes() is recycled against all rows and triggers the
# "All aesthetics have length 1, but the data has ... rows" warning.
ggplot(X, aes(x, y, color = isout)) +
  geom_point(size = 3) +
  annotate("point", x = dat.center[1], y = dat.center[2],
           size = 5, color = 'blue') +
  labs(title = 'Outliers',
       subtitle = 'Outlier detection',
       caption = "cap",
       x = 'x',
       y = 'y') +
  theme_bw() +
  # "none" is the documented way to suppress the legend ("" is not).
  theme(legend.position = "none")
Stray Scoring
Stray scores represent an observation's degree of anomaly. The algorithm uses iterative sampling and k-nearest neighbors to compute these scores. Higher scores indicate a greater probability of being an outlier.
# stray: high-dimensional outlier detection via k-nearest-neighbour
# distances; out_scores grow with the degree of anomaly.
mystray <- find_HDoutliers(X[, 1:2],
                           alpha = 0.35,
                           p = 0.75,
                           k = 5)
hist(mystray$out_scores)

# Classification produced by the algorithm itself (console output from
# the original run):
#   outlier typical
#         3    2000
table(mystray$type)

# Re-threshold the raw scores at the empirical 95th percentile instead
# of relying on the algorithm's own outlier/typical split.
threshold_stray <- quantile(mystray$out_scores, 0.95)
X$isout <- mystray$out_scores > threshold_stray
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# Surprisals ----
The surprisal approach measures how unlikely each observation is under a fitted multivariate normal model: squared Mahalanobis distances are converted to chi-square tail probabilities, and points with very small probabilities (i.e., extreme surprisal) are flagged and visualized.
# Fit mean/covariance, then score each point by how extreme its squared
# Mahalanobis distance is under a chi-square(df = 2) reference.
mean_val <- colMeans(X[, 1:2])
cov_val <- cov(X[, 1:2])
dist_sq <- mahalanobis(X[, 1:2], mean_val, cov_val)
# Upper-tail probability P(chi2_2 >= dist_sq) — a p-value-like score,
# not a probability density as the original comment claimed.
p_vals <- pchisq(dist_sq, df = 2, lower.tail = FALSE)
X$isout <- p_vals < 0.025
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# Glosh Scoring ----
Glosh scoring detects outliers in data by comparing a point’s local density to the density of the cluster it belongs to, using a hierarchical approach. It is primarily implemented in conjunction with the HDBSCAN clustering algorithm and generates a score for each data point that indicates how likely it is to be an outlier. (Note: the code below approximates this idea with Mahalanobis distances computed from robust location/scatter estimates, rather than the HDBSCAN-based GLOSH score itself.)
# Robust location/scatter (RobStatTM::covRob) so that the estimates are
# not themselves distorted by the outliers being hunted.
rob_cov <- RobStatTM::covRob(X[, 1:2])
# Mahalanobis distance using the robust estimates
robust_dist <- mahalanobis(X[, 1:2],
                           rob_cov$center,
                           rob_cov$cov)
# Threshold: points above the empirical 95th percentile of the robust
# distances are flagged. (The original comment claimed 97.5% while the
# code used 0.95; the comment now matches the code.)
threshold_glosh <- quantile(robust_dist, 0.95)
X$isout <- robust_dist > threshold_glosh
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# Density-Based Methods ----
Local Outlier Factor
The Local Outlier Factor (LOF) algorithm is an unsupervised detection approach to identifying outliers in a dataset (Breunig et al., 2000). It does this by computing the local density surrounding a given data point and compares it to the density around other data points.
It considers as outliers the samples that have a substantially lower density than their neighbors.
Unlike global methods, LOF identifies local outliers—points that appear anomalous relative to their local neighborhood, rather than the entire dataset’s density.
One of the appealing properties of LOF is that it is capable of working with mixed variables, numeric and categorical columns.
The LOF algorithm requires the specification of the k-nearest neighbors of a data point. It then calculates the distance between the data point and each of its k-nearest neighbors. In turn, the local outlier probability, which ranges from 0 to 1, constitutes a direct measure of the likelihood of the particular point being an outlier. The higher the outlier probability, the more likely the data point is to be an outlier. By contrast, a low outlier factor indicates that a data point is more likely to be a non-outlier. The algorithm is well suited to identifying points that are significantly different from their neighbors, as in fraud detection or identifying financial frailty.
The downside of LOF as with all score-based methods is that there needs to be some criteria towards a separation threshold (between inliers and outliers).
# LOF: compare each point's local density to that of its minPts
# nearest neighbours; scores well above 1 indicate local outliers.
lof_scores <- lof(X[, 1:2], minPts = 5)
threshold_lof <- quantile(lof_scores, 0.95)
X$isout <- lof_scores > threshold_lof
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# Thresholding: there is no universal threshold for LOF. While scores
# near 1 represent inliers, a score of 1.5 or 2.0 is often used to flag
# outliers, depending on the data's density.
Tree Based Methods
Isolation Forests
# Isolation forest: anomalies are isolated with fewer random splits,
# yielding higher anomaly scores.
# NOTE(review): the forest is fit on the full X, which by now contains
# derived columns (myeuclid, mdist, isout, ...) in addition to x and y.
# Presumably X[, 1:2] was intended, as in every other method here —
# kept as-is to preserve the original behavior; confirm.
myiso <- isolation.forest(X)
X$myisopred <- predict(myiso, X)
threshold <- quantile(X$myisopred, 0.99)
X$isout <- X$myisopred > threshold
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# Model-Based Methods ----
One-Class SVM
# One-class SVM: learn a boundary around the "normal" mass of the data;
# nu upper-bounds the fraction of training points treated as outliers.
oc_svm_model <- svm(X[, 1:2],
                    type = "one-classification",
                    kernel = "radial",
                    nu = 0.05,
                    gamma = 0.1)
SVM_preds <- predict(oc_svm_model, X[, 1:2])
# predict() returns TRUE for inliers, so negate to flag outliers.
X$isout <- !SVM_preds
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# One-Class Random Forests ----
One-class or unsupervised random forests are commonly used for outlier detection by measuring how “dissimilar” an observation is from the rest of the data. This is often achieved using proximity matrices.
# Unsupervised random forest: the proximity matrix yields an
# outlyingness score per observation.
rf_unsupervised <- randomForest(X[, 1:2], proximity = TRUE)
rf_outlier_scores <- randomForest::outlier(rf_unsupervised)
rf_threshold <- quantile(rf_outlier_scores, 0.95)
X$isout <- rf_outlier_scores > rf_threshold
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# outForest: regress each variable on the others with random forests;
# observations with large out-of-bag residuals are reported as outliers.
out_model <- outForest::outForest(X[, 1:2], verbose = 0)
detected_outliers <- outliers(out_model)
# BUG FIX: assigning the string "TRUE" silently coerced the logical
# isout column to character; assign the logical constant instead.
X[detected_outliers$row, ]$isout <- TRUE
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

# Possibilistic Clustering ----
Possibilistic clustering assigns data points to clusters based on “typicality” (degree of compatibility) rather than probabilistic sharing, making it highly resistant to noise and outliers. Unlike fuzzy clustering, it allows points to have high membership only in clusters they are truly typical of.
Perform outlier detection using possibilistic clustering
Possibilistic Fuzzy C-Means Clustering
Adjust parameters like ‘k’ (number of clusters) and ‘m’ (fuzziness parameter) as needed
# Unsupervised possibilistic fuzzy c-means (UPFC) on standardized data.
# Tune centers (number of clusters) and the fuzzifier m as needed.
res.upfc <- ppclust::upfc(scale(X[, 1:2]),
                          m = 3,
                          eta = 3,
                          centers = 3)
out <- detect.outliers(res.upfc, alpha = 0.1)
plot(out)

# odetector: typicality-based detection run directly on the scaled data.
outlier_results <- detect.outliers(scale(X[, 1:2]),
                                   alpha2 = 0.15,
                                   tsc = "m2")
plot(outlier_results)

# Inspect the detected outliers in the original coordinates.
outliers_odetector <- X[outlier_results$outliers1, ]
plot(outliers_odetector)

myindex <- data.frame(myind = outlier_results$outliers1)
X$isout <- FALSE
# BUG FIX: assigning the string "TRUE" would coerce the freshly logical
# isout column to character; use the logical constant.
X[myindex$myind, ]$isout <- TRUE
plot(X[, 1:2], col = as.factor(X$isout), lwd = 5, pch = 19)