suppressWarnings({
suppressPackageStartupMessages({
library(careless)
library(tidyverse)
library(isotree)
library(manydist)
library(stray)
library(RobStatTM)
library(dbscan)
library(e1071)
library(randomForest)
library(outForest)
library(odetector)
library(weird)
}
)})
options(scipen = 9999, digits = 3)
Outlier Detection Methods
OUTLIER DETECTION: a simple taxonomy
“It is absurd to divide people into good and bad.
People are either charming or tedious.”
Oscar Wilde
Outlier or anomaly detection algorithms are critical for fraud detection. They represent a first line of defense to identify transactions, behaviors, or data points that deviate significantly from established normal patterns.
Outlier detection algos enable real-time flagging of anomalies—such as unusual credit card spending or unauthorized network access—that traditional, rule-based systems often miss.
# Simulate two Gaussian clusters of 1000 points each, then append three
# hand-placed points at (-1, 1), (-0.5, 0.5) and (0, 0).
set.seed(42)
group1 <- data.frame(
  x = rnorm(n = 1000, mean = -1, sd = 0.4),
  y = rnorm(n = 1000, mean = -1, sd = 0.2)
)
group2 <- data.frame(
  x = rnorm(n = 1000, mean = 1, sd = 0.2),
  y = rnorm(n = 1000, mean = 1, sd = 0.4)
)
X <- rbind(group1, group2)
X <- rbind(X, c(-1, 1), c(-0.5, 0.5), c(0, 0))
plot(X, col = c("darkred"), pch = 16)
Distance-Based Methods
Euclidean Distance
The Euclidean Distance method for detecting outliers identifies anomalies by calculating the straight-line distance between data points and a central point in a space. The central point would be either the mean or a centroid depending on the space dimensions. The method requires the setting of a threshold. Points with a Euclidean distance greater than the cutoff are flagged as outliers. To provide consistency we will use the 95th percentile as our cutoff point, throughout.
# Centre and scale every column of X (scale() defaults), keep the result as a
# data frame, and record the centroid of the two coordinate columns.
X <- as.data.frame(scale(X))
dat.center <- colMeans(X[, 1:2])
# Straight-line (Euclidean) distance between one observation and a centre.
#
# point:  numeric vector with the coordinates of a single observation.
# center: numeric vector with the coordinates of the reference point,
#         expected to line up element-by-element with `point`.
# Returns sqrt((x1 - c1)^2 + (x2 - c2)^2 + ...) as a single number.
euclidean_dist_to_center <- function(point, center) {
  offsets <- point - center
  sqrt(sum(offsets * offsets))
}
# Euclidean distance of every observation from the centroid.
# Only the two coordinate columns are passed to apply(): handing it the whole
# of X is correct only while X has exactly two columns, and would silently mix
# derived columns (scores, flags) into the distance if this chunk is re-run.
X$myeuclid <- apply(
  X[, 1:2], 1,
  euclidean_dist_to_center,
  center = dat.center
)
# Observations beyond the 95th percentile of the distances are flagged.
threshold_quant <- quantile(X$myeuclid, 0.95)
X$isout <- X$myeuclid > threshold_quant
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Manhattan Distance
Unlike the Euclidean distance, which is the square root of the sum of the squared coordinate differences, the Manhattan distance is the sum of the absolute coordinate differences between the points.
# City-block (Manhattan) distance between one observation and a centre.
#
# point:  numeric vector with the coordinates of a single observation.
# center: numeric vector with the coordinates of the reference point,
#         expected to line up element-by-element with `point`.
# Returns abs(x1 - c1) + abs(x2 - c2) + ... as a single number.
manhattan_dist_to_center <- function(point, center) {
  gaps <- abs(point - center)
  sum(gaps)
}
# Manhattan distance of every observation from the centroid, computed on the
# two coordinate columns.
X$myman <- apply(
  X[, 1:2],
  MARGIN = 1,
  FUN = manhattan_dist_to_center,
  center = dat.center
)
# Observations beyond the 95th percentile of the distances are flagged.
threshold_quant <- quantile(X$myman, probs = 0.95)
X$isout <- X$myman > threshold_quant
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Mahalanobis Distance
The Mahalanobis distance approach to outlier detection differs from the Euclidean distance one by accounting for data covariance, correlations between variables, and varying scales. Euclidean distance measures straight-line distance; Mahalanobis, on the other hand, measures how many standard deviations a point is from the mean.
# Squared Mahalanobis distance of every observation from the sample mean,
# using the sample covariance of the two coordinate columns.
dat.center <- colMeans(X[, 1:2])
dat.cov <- cov(X[, 1:2])
X$mdist <- mahalanobis(
  x = X[, 1:2],
  center = dat.center,
  cov = dat.cov
)
# Cut at the 95th percentile, the cutoff this document states it uses for every
# method (this chunk previously used 0.99, making its plot not comparable).
threshold_quant <- quantile(X$mdist, 0.95)
X$isout <- X$mdist > threshold_quant
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
# The same flags drawn with ggplot2. The legend is hidden with the documented
# value "none"; an empty string is not a valid legend position.
ggplot(X, aes(x, y, color = isout)) +
  geom_point(size = 3) +
  # geom_point(aes(dat.center[1], dat.center[2]), size = 5, color = "blue") +
  labs(
    title = "Outliers",
    subtitle = "Outlier detection",
    caption = "cap",
    x = "x",
    y = "y"
  ) +
  theme_bw() +
  theme(legend.position = "none")
Stray Scoring
Stray scores represent an observation's degree of anomaly. The algorithm uses iterative sampling and k-nearest neighbors to compute these scores. Higher scores indicate a greater probability of being an outlier.
# Run stray's outlier search on the two coordinate columns and keep the full
# result, which carries both the scores and the indices stray selects itself.
mystray <- find_HDoutliers(
  X[, 1:2],
  alpha = 0.35,
  p = 0.75,
  k = 5
)
# Manual cutoff: flag scores above their own 95th percentile.
threshold_stray <- quantile(mystray$out_scores, probs = 0.95)
X$isout <- mystray$out_scores > threshold_stray
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
We can dispense with the threshold selection above; the stray scoring algorithm uses extreme value theory to calculate the threshold for an anomaly score.
# Flag the rows that stray itself reported as outliers.
# Membership is tested with %in% so the column stays logical (as in the other
# chunks) and the code still runs when no index is reported; assigning through
# X[idx, ]$isout fails on an empty index because the subset has zero rows.
X$isout <- seq_len(nrow(X)) %in% mystray$outliers
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Surprisals
Surprisal reflects the surprise or distinctiveness of some outcome of a random variable. When the outcome is very unlikely the surprisal is high. It is defined as the negative logarithm of the probability of the outcome in question.
Surprisal scores ( surprisal = -log(p)) are preferred over raw probabilities (p) because they linearize exponential differences, transforming tiny, hard-to-distinguish probabilities into usable, additive, and interpretable scales. Surprisal efficiently highlights rare events and is robust to model mis-specification, unlike raw thresholds which struggle with extreme, heavily skewed probabilities.
Here we use the Mahalanobis distance to obtain the distance of each data point from the mean, scaled by covariance. Squared distances follow a chi-square distribution, which we use to find the probability of each point occurring by chance. We then convert these probabilities to surprisal scores to highlight extreme values. We set the threshold at the 95th percentile.
# Squared Mahalanobis distance of every point from the sample mean.
mean_val <- colMeans(X[, 1:2])
cov_val <- cov(X[, 1:2])
dist_sq <- mahalanobis(x = X[, 1:2], center = mean_val, cov = cov_val)
# Upper-tail chi-square probability of each squared distance
# (df = 2, one per coordinate column).
p_vals <- pchisq(q = dist_sq, df = 2, lower.tail = FALSE)
# Surprisal probabilities with the "gpd" approximation.
prob <- surprisals_prob(p_vals, approximation = "gpd")
# NOTE(review): this flags every value *below* the 95th percentile of `prob`.
# The recorded output shows 200 of 2003 rows flagged rather than about 5%;
# confirm whether the 5th percentile was the intended cutoff.
threshold_surpr <- quantile(prob, probs = 0.95)
X$isout <- prob < threshold_surpr
table(X$isout)
FALSE TRUE
1803 200
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Here we use Z-scores to obtain the distance of each data point from the mean. We then convert these Z-scores to surprisal scores to highlight extreme values. We use the 95th percentile as the threshold.
# Standardise the two coordinate columns and keep them as a data frame.
z_scores <- as.data.frame(scale(X[, 1:2]))
# Surprisal probabilities of the standardised data, "gpd" approximation.
prob <- surprisals_prob(z_scores, approximation = "gpd")
# NOTE(review): as in the previous chunk, values *below* the 95th percentile of
# `prob` are flagged; confirm the intended direction and percentile.
threshold_surpr <- quantile(prob, probs = 0.95)
X$isout <- prob < threshold_surpr
table(X$isout)
FALSE TRUE
1803 200
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Glosh Scoring
Glosh scoring detects outliers in data by comparing a point’s local density to the density of the cluster it belongs to, using a hierarchical approach. It is primarily implemented in conjunction with the HDBSCAN clustering algorithm and generates a score for each data point that indicates how likely it is to be an outlier.
# NOTE(review): this section is titled "Glosh Scoring", but the code below
# computes a robust Mahalanobis distance; no GLOSH/HDBSCAN call is made here.
# Confirm which method was intended.
#
# Robust location and scatter estimates for the two coordinate columns.
rob_cov <- RobStatTM::covRob(X[, 1:2])
# Squared Mahalanobis distance measured with the robust centre and covariance.
robust_dist <- mahalanobis(
  X[, 1:2],
  center = rob_cov$center,
  cov = rob_cov$cov
)
# Empirical cutoff: distances above their own 95th percentile are flagged
# (no chi-square quantile is used).
threshold_glosch <- quantile(robust_dist, probs = 0.95)
X$isout <- robust_dist > threshold_glosch
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Density-Based Methods
Local Outlier Factor
The Local Outlier Factor (LOF) algorithm is an unsupervised detection approach to identifying outliers in a dataset (Breunig et al., 2000). It does this by computing the local density surrounding a given data point and compares it to the density around other data points.
It considers as outliers the samples that have a substantially lower density than their neighbors.
Unlike global methods, LOF identifies local outliers—points that appear anomalous relative to their local neighborhood, rather than the entire dataset’s density.
One of the appealing properties of LOF is that it is capable of working with mixed variables, numeric and categorical columns.
The LOF algorithm requires the specification of k-nearest neighbors of a data point. It then calculates the distance between the data point and each of its k-nearest neighbors. In turn, the local outlier factor, which compares the local density of a point with the local densities of its neighbors, constitutes a direct measure of how isolated the particular point is. The higher the factor rises above 1, the more likely the data point is to be an outlier. By contrast, a factor close to 1 indicates that a data point is more likely to be a non-outlier data point. The algorithm is ideal for identifying points that are significantly different from their neighbors, as in fraud detection or identifying financial frailty.
The downside of LOF as with all score-based methods is that there needs to be some criteria towards a separation threshold (between inliers and outliers).
# Local outlier factor of every observation, using neighbourhoods of 5 points.
lof_scores <- lof(
  X[, 1:2],
  minPts = 5
)
# Scores above their own 95th percentile are flagged.
threshold_lof <- quantile(lof_scores, probs = 0.95)
X$isout <- lof_scores > threshold_lof
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Thresholding: There is no universal threshold for LOF. While scores near 1 represent inliers, a score of 1.5 or 2.0 is often used to flag outliers depending on the data’s density.
Tree Based Methods
Isolation Forests
# Fit and score the isolation forest on the two coordinate columns only.
# By this point X also carries the scores and the outlier flag produced by the
# earlier chunks; passing the whole of X would let those derived columns drive
# the isolation scores.
myiso <- isolation.forest(X[, 1:2])
X$myisopred <- predict(myiso, X[, 1:2])
# Cut at the 95th percentile, the cutoff this document states it uses for every
# method (this chunk previously used 0.99).
threshold <- quantile(X$myisopred, 0.95)
X$isout <- X$myisopred > threshold
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Model-Based Methods
One-Class SVM
# One-class SVM with a radial kernel, trained on the two coordinate columns.
oc_svm_model <- svm(
  X[, 1:2],
  type = "one-classification",
  kernel = "radial",
  nu = 0.05,
  gamma = 0.1
)
# Predict on the training data and flag the rows whose prediction is FALSE.
SVM_preds <- predict(oc_svm_model, X[, 1:2])
X$isout <- !SVM_preds
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
One-Class Random Forests
One-class or unsupervised random forests are commonly used for outlier detection by measuring how “dissimilar” an observation is from the rest of the data. This is often achieved using proximity matrices.
# Random forest fitted without a response on the two coordinate columns,
# keeping the proximity matrix that the outlier measure is computed from.
rf_unsupervised <- randomForest(
  X[, 1:2],
  proximity = TRUE
)
rf_outlier_scores <- randomForest::outlier(rf_unsupervised)
# Scores above their own 95th percentile are flagged.
rf_threshold <- quantile(rf_outlier_scores, probs = 0.95)
X$isout <- rf_outlier_scores > rf_threshold
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
# Second random-forest approach: outForest on the two coordinate columns.
out_model <- outForest::outForest(X[, 1:2], verbose = 0)
detected_outliers <- outliers(out_model)
# Flag the rows listed in the `row` column of the result.
# Membership is tested with %in% so the column stays logical and the code still
# runs when nothing is detected; assigning through X[idx, ]$isout fails on an
# empty index and turns the whole column into character.
X$isout <- seq_len(nrow(X)) %in% detected_outliers$row
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )
Possibilistic Clustering
Possibilistic clustering assigns data points to clusters based on “typicality” (degree of compatibility) rather than probabilistic sharing, making it highly resistant to noise and outliers. Unlike fuzzy clustering, it allows points to have high membership only in clusters they are truly typical of.
Perform outlier detection using possibilistic clustering
Possibilistic Fuzzy C-Means Clustering
Adjust parameters like ‘k’ (number of clusters) and ‘m’ (fuzziness parameter) as needed
# Possibilistic fuzzy clustering of the scaled coordinates into three clusters.
res.upfc <- ppclust::upfc(
  scale(X[, 1:2]),
  m = 3,
  eta = 3,
  centers = 3
)
# Outlier detection on the fitted clustering result.
out <- detect.outliers(res.upfc, alpha = 0.1)
plot(out)
# NOTE(review): this second call passes the scaled data itself rather than the
# fitted clustering result used just above; confirm detect.outliers() accepts
# this input.
outlier_results <- detect.outliers(
  scale(X[, 1:2]),
  alpha2 = 0.15,
  tsc = "m2"
)
plot(outlier_results)
myindex <- data.frame(myind = outlier_results$outliers1)
# Flag the reported rows. Membership is tested with %in% so the column stays
# logical and the code still runs when no index is reported; assigning through
# X[idx, ]$isout fails on an empty index and turns the column into character.
X$isout <- seq_len(nrow(X)) %in% myindex$myind
plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )