import numpy as np
import matplotlib.pyplot as plt
Anomaly Detection
Anomaly Detection identifies unusual patterns or outliers in data, crucial for detecting deviations from the norm and highlighting potential irregularities or abnormalities.
Import Libraries
Dataset
= np.load("data/X_train.npy")
X_train = np.load("data/X_test.npy")
X_test = np.load("data/y_test.npy") y_test
print(X_train.shape)
print(X_test.shape)
print(y_test.shape)
(307, 2)
(307, 2)
(307,)
0], X_train[:, 1], marker='x', c='b')
plt.scatter(X_train[:,
"The first dataset")
plt.title('Throughput (mb/s)')
plt.ylabel('Latency (ms)')
plt.xlabel(0, 25, 0, 25])
plt.axis([ plt.show()
Gaussian Distribution
def estimate_gaussian(X):
= np.mean(X, axis=0)
mu = np.var(X, axis=0)
var
return mu, var
def univariate_gaussian(X, mu, var):
= (1 / (np.sqrt(2 * np.pi * var))) * np.exp(-((X - mu) ** 2) / (2 * var))
p
return p
def multivariate_gaussian(X, mu, var):
= len(mu)
k
if var.ndim == 1:
= np.diag(var)
var
= X - mu
X = (2 * np.pi)**(-k/2) * np.linalg.det(var)**(-0.5) * \
p -0.5 * np.sum(np.matmul(X, np.linalg.pinv(var)) * X, axis=1))
np.exp(
return p
= estimate_gaussian(X_train)
mu, var print("mu = ", mu)
print("var = ", var)
mu = [14.11222578 14.99771051]
var = [1.83263141 1.70974533]
print(univariate_gaussian(X_train[246][0], mu[0], var[0]))
0.28071685840987737
= np.meshgrid(np.arange(0, 30, 0.5), np.arange(0, 30, 0.5))
X1, X2 = multivariate_gaussian(np.stack([X1.ravel(), X2.ravel()], axis=1), mu, var)
Z = Z.reshape(X1.shape)
Z
0], X_train[:, 1], 'bx')
plt.plot(X_train[:, =10**(np.arange(-20., 1, 3)), linewidths=1)
plt.contour(X1, X2, Z, levels
"The Gaussian contours of the distribution fit to the dataset")
plt.title('Throughput (mb/s)')
plt.ylabel('Latency (ms)')
plt.xlabel( plt.show()
Selecting threshold
def select_threshold(y_val, p_val):
= 0
best_epsilon = 0
best_F1 = 0
F1
= (max(p_val) - min(p_val)) / 1000
step_size
for epsilon in np.arange(min(p_val), max(p_val), step_size):
= (p_val < epsilon)
predictions
= np.sum((predictions == 1) & (y_val == 1))
tp = np.sum((predictions == 1) & (y_val == 0))
fp = np.sum((predictions == 0) & (y_val == 1))
fn
= 0, 0
prec, rec
if (tp+fp) != 0:
= (tp)/(tp+fp)
prec if (tp+fn) != 0:
= (tp)/(tp+fn)
rec
= 0
F1 if (prec+rec) != 0:
= 2*prec*rec/(prec+rec)
F1
if F1 > best_F1:
= F1
best_F1 = epsilon
best_epsilon
return best_epsilon, best_F1
Finding Outliers
= multivariate_gaussian(X_train, mu, var)
p = multivariate_gaussian(X_test, mu, var)
p_val = select_threshold(y_test, p_val)
epsilon, F1
print('Best epsilon found using cross-validation: %e' % epsilon)
print('Best F1 on Cross Validation Set: %f' % F1)
Best epsilon found using cross-validation: 8.990853e-05
Best F1 on Cross Validation Set: 0.875000
= p < epsilon
outliers
= np.meshgrid(np.arange(0, 30, 0.5), np.arange(0, 30, 0.5))
X1, X2 = multivariate_gaussian(np.stack([X1.ravel(), X2.ravel()], axis=1), mu, var)
Z = Z.reshape(X1.shape)
Z
0], X_train[:, 1], 'bx')
plt.plot(X_train[:, =10**(np.arange(-20., 1, 3)), linewidths=1)
plt.contour(X1, X2, Z, levels0], X_train[outliers, 1], 'ro',
plt.plot(X_train[outliers, =10, markerfacecolor='none', markeredgewidth=2)
markersize
"The Gaussian contours of the distribution fit to the dataset")
plt.title('Throughput (mb/s)')
plt.ylabel('Latency (ms)')
plt.xlabel(
plt.show()
print('# Anomalies found: %d' % sum(p < epsilon))
# Anomalies found: 6