Outlier Detection Methods

Author

AE Rodriguez

OUTLIER DETECTION: a simple taxonomy

“Don’t talk unless you can improve the silence.”

J.L. Borges

Outlier or anomaly detection algorithms are critical for fraud detection. They represent a first line of defense to identify transactions, behaviors, or data points that deviate significantly from established normal patterns.

Outlier detection algorithms enable real-time flagging of anomalies—such as unusual credit card spending or unauthorized network access—that traditional, rule-based systems often miss.

suppressWarnings({
  suppressPackageStartupMessages({
    library(careless)
    library(tidyverse)
    library(isotree)
    library(manydist)
    library(stray)
    library(RobStatTM)
    library(dbscan)
    library(e1071)
    library(randomForest)
    library(outForest)
    library(odetector)
  })
})

options(scipen = 9999, digits = 3)

# Simulate two well-separated Gaussian clusters, then append three points
# that sit between them and should register as outliers
set.seed(42)
group1 <- data.frame(x = rnorm(1000, -1, .4),
                     y = rnorm(1000, -1, .2))
group2 <- data.frame(x = rnorm(1000, +1, .2),
                     y = rnorm(1000, +1, .4))
X <- rbind(group1, group2)

X <- rbind(X, c(-1, 1), c(-0.5, 0.5), c(0, 0))

plot(X)

Distance-Based Methods

Euclidean
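
The simplest distance-based approach: compute each point's distance to the data centroid and flag the most distant ones.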

dat.center <- colMeans(X[,1:2])

euclidean_dist_to_center <- function(point, center) {
  # Euclidean distance:
  # sqrt((x1 - c1)^2 + (x2 - c2)^2 + ...)
  
  sqrt(sum((point - center)^2))
}

X$myeuclid <- apply(X[,1:2], 1, euclidean_dist_to_center, center = dat.center)

threshold_quant <- quantile(X$myeuclid, 0.95)

X$isout <- X$myeuclid > threshold_quant

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

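Manhattan

The Manhattan (L1) distance sums absolute coordinate differences, which makes it less sensitive to a single large deviation than the Euclidean distance.
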
manhattan_dist_to_center <- function(point, center) {
  # Manhattan (L1) distance:
  # |x1 - c1| + |x2 - c2| + ...
  
  sum(abs(point - center))
}


# Use only the coordinate columns: X has gained derived columns by now
X$myman <- apply(X[,1:2], 1, manhattan_dist_to_center, center = dat.center)

threshold_quant <- quantile(X$myman, 0.95)

X$isout <- X$myman > threshold_quant

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

Mahalanobis Distance
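
The Mahalanobis distance generalizes the Euclidean distance by accounting for the covariance between variables: it measures how many "standardized" units a point lies from the centroid along the data's own correlation structure. For multivariate normal data, squared Mahalanobis distances approximately follow a chi-square distribution with degrees of freedom equal to the number of variables, which supplies a principled cutoff.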

dat.center <- colMeans(X[,1:2])
dat.cov <- cov(X[,1:2])

X$mdist <- mahalanobis(
  x = X[,1:2],
  center = dat.center,
  cov = dat.cov
)

# Two candidate cutoffs: the chi-square quantile (principled under normality)
# and an empirical quantile of the observed distances; the latter is used here
threshold_chi <- qchisq(p = 0.95, df = ncol(X[, 1:2]))
threshold_quant <- quantile(X$mdist, 0.99)

X$isout <- X$mdist > threshold_quant

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

ggplot(X, aes(x, y, color = isout)) +
  geom_point(size = 3) +
  annotate("point", x = dat.center[1], y = dat.center[2],
           size = 5, color = 'blue') +
  labs(title = 'Outliers',
       subtitle = 'Outlier detection',
       caption = "cap",
       x = 'x',
       y = 'y') +
  theme_bw() +
  theme(legend.position = "none")

Stray Scoring

Stray scores represent an observation's degree of anomaly. The algorithm computes the scores from k-nearest-neighbor distances and uses extreme value theory to locate the cutoff that separates outliers. Higher scores indicate a greater probability of being an outlier.

mystray <- find_HDoutliers(X[,1:2], 
                           alpha = 0.35,
                           p = 0.75,
                           k = 5)
hist(mystray$out_scores)

table(mystray$type)

outlier typical 
      3    2000 
threshold_stray <- quantile(mystray$out_scores, 0.95)

X$isout <- mystray$out_scores > threshold_stray

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

Surprisals

A surprisal is the negative log-probability of an observation: the less probable a point under the fitted distribution, the more "surprising" it is. The code below computes squared Mahalanobis distances, converts them to chi-square tail probabilities, and flags points whose probability falls below a cutoff.

mean_val <- colMeans(X[,1:2])
cov_val <- cov(X[,1:2])
dist_sq <- mahalanobis(X[,1:2], mean_val, cov_val)

# Chi-square upper-tail probability for each squared distance (df = 2 variables)
p_vals <- pchisq(dist_sq, df = 2, lower.tail = FALSE)

X$isout <- p_vals < 0.025

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)
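
The chunk above stops at tail probabilities. Converting them to actual surprisal values is a one-liner; the conversion below is an addition to the original snippet, using the standard information-theoretic definition (surprisal = -log2 p, in bits):

# Surprisal in bits: improbable points carry high surprisal
X$surprisal <- -log2(p_vals)

X$isout <- X$surprisal > quantile(X$surprisal, 0.95)

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)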

GLOSH Scoring

GLOSH (Global-Local Outlier Score from Hierarchies) detects outliers by comparing a point's local density to the density of the cluster it belongs to, using a hierarchical approach. It is implemented in conjunction with the HDBSCAN clustering algorithm and produces a score for each data point indicating how likely it is to be an outlier. Note that the snippet below does not compute GLOSH itself; it uses a robust Mahalanobis distance as a related baseline, with a GLOSH sketch following it.

# Robust location/scatter estimates resist distortion by the outliers themselves
rob_cov <- RobStatTM::covRob(X[,1:2])

# Mahalanobis distance using robust estimates
robust_dist <- mahalanobis(X[,1:2],
                           rob_cov$center,
                           rob_cov$cov)

# Points above the empirical 95% quantile of the robust distances are flagged
threshold_robust <- quantile(robust_dist, 0.95)

X$isout <- robust_dist > threshold_robust

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)
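
For GLOSH proper, here is a minimal sketch using the already-loaded dbscan package, whose hdbscan() fit exposes the scores as outlier_scores; the parameter choices are illustrative:

# GLOSH scores from an HDBSCAN fit: higher = more anomalous
hdb <- hdbscan(as.matrix(X[,1:2]), minPts = 5)
glosh_scores <- hdb$outlier_scores

X$isout <- glosh_scores > quantile(glosh_scores, 0.95, na.rm = TRUE)

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)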

Density-Based Methods

Local Outlier Factor

The Local Outlier Factor (LOF) algorithm is an unsupervised approach to identifying outliers in a dataset (Breunig et al., 2000). It works by computing the local density surrounding a given data point and comparing it to the density around neighboring data points.

It considers as outliers the samples that have a substantially lower density than their neighbors.

Unlike global methods, LOF identifies local outliers—points that appear anomalous relative to their local neighborhood, rather than the entire dataset’s density.

One of the appealing properties of LOF is that, given an appropriate dissimilarity measure such as Gower distance, it can work with mixed data containing both numeric and categorical columns (a sketch follows the example below).

The LOF algorithm requires specifying k, the number of nearest neighbors for each data point. It then compares the local density around the point with the local densities around those k neighbors. The resulting score is not a probability: values near 1 indicate a point whose density matches its neighborhood, while values substantially above 1 flag potential outliers (the related LoOP variant rescales the idea to a probability between 0 and 1). The algorithm is well suited to spotting points that differ markedly from their neighbors, as in fraud detection or screening for financial frailty.

The downside of LOF, as with all score-based methods, is that some criterion is needed to set the separation threshold between inliers and outliers.

lof_scores <- lof(X[,1:2], minPts = 5)

threshold_lof = quantile(lof_scores, 0.95)

X$isout <- lof_scores > threshold_lof


plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )

Thresholding: There is no universal threshold for LOF. While scores near 1 represent inliers, a score of 1.5 or 2.0 is often used to flag outliers depending on the data’s density.
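
As a sketch of the mixed-data point made earlier: compute a Gower dissimilarity and hand it to lof(), which also accepts a dist object. This assumes the cluster package is installed; the categorical column grp is fabricated purely for illustration:

# Hypothetical mixed-type version of the data: add an illustrative factor column
X_mixed <- data.frame(X[,1:2],
                      grp = factor(sample(c("a", "b"), nrow(X), replace = TRUE)))

# Gower dissimilarity handles numeric and categorical columns together
gower_d <- cluster::daisy(X_mixed, metric = "gower")

# dbscan::lof() accepts a dist object in place of a numeric matrix
lof_mixed <- lof(gower_d, minPts = 5)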

Tree-Based Methods

Isolation Forests
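
An isolation forest isolates observations by splitting the data at random. Because anomalies are few and different, they tend to be separated from the rest in fewer random splits, so a short average path length through the trees translates into a high anomaly score, with no distance or density computation required.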

# Fit on the coordinate columns only, not the derived score columns
myiso <- isolation.forest(X[,1:2])

X$myisopred <- predict(myiso, X[,1:2])

threshold <- quantile(X$myisopred, 0.99)

X$isout <- X$myisopred > threshold

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)


Model-Based Methods

One-Class SVM
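
A one-class SVM learns a boundary around the bulk of the data in a kernel-induced feature space and treats points outside that boundary as outliers. The nu parameter acts as an upper bound on the fraction of training points allowed to fall outside.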

oc_svm_model <- svm(X[,1:2], 
                    type = "one-classification", 
                    kernel = "radial", 
                    nu = 0.05, 
                    gamma = 0.1)


# predict() returns TRUE for points inside the learned boundary,
# so negate it to flag outliers
SVM_preds <- predict(oc_svm_model, X[,1:2])

X$isout <- !SVM_preds

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )

One-Class Random Forests

One-class or unsupervised random forests are commonly used for outlier detection by measuring how “dissimilar” an observation is from the rest of the data. This is often achieved using proximity matrices.

# Grow an unsupervised forest (no response variable) and record proximities
rf_unsupervised <- randomForest(X[,1:2], proximity = TRUE)

# outlier() turns the proximity matrix into a per-observation outlyingness score
rf_outlier_scores <- randomForest::outlier(rf_unsupervised)

rf_threshold = quantile(rf_outlier_scores, 0.95)

X$isout <- rf_outlier_scores > rf_threshold

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19 )

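A complementary option is the outForest package, which predicts each variable from the others with random forests and flags observations whose observed values deviate strongly from their out-of-sample predictions.
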
out_model <- outForest::outForest(X[,1:2], verbose = 0)

detected_outliers <- outliers(out_model)

# Reset the flag, then mark the rows outForest identified (logical TRUE,
# not the string "TRUE")
X$isout <- FALSE
X$isout[detected_outliers$row] <- TRUE

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)

Possibilistic Clustering

Possibilistic clustering assigns data points to clusters based on “typicality” (degree of compatibility) rather than probabilistic sharing, making it highly resistant to noise and outliers. Unlike fuzzy clustering, it allows points to have high membership only in clusters they are truly typical of.

Below, outlier detection is performed with unsupervised possibilistic fuzzy c-means clustering (ppclust::upfc); adjust parameters such as the number of clusters (centers) and the fuzziness parameters (m and eta) as needed.

res.upfc <- ppclust::upfc(scale(X[,1:2]),
                          m = 3,
                          eta = 3,
                          centers = 3)
out <- detect.outliers(res.upfc, alpha = 0.1)
plot(out)

outlier_results <- detect.outliers(scale(X[,1:2]),
                                   alpha2 = 0.15,
                                   tsc = "m2")

plot(outlier_results)

outliers_odetector <- X[outlier_results$outliers1, ]

plot(outliers_odetector[,1:2])

# Flag the rows the detector indexed (logical TRUE, not the string "TRUE")
X$isout <- FALSE
X$isout[outlier_results$outliers1] <- TRUE

plot(X[,1:2], col = as.factor(X$isout), lwd = 5, pch = 19)