import numpy as np
import matplotlib.pyplot as plt
Anomaly Detection
Import Libraries
Dataset
= np.load("data/X_train.npy")
X_train = np.load("data/X_test.npy")
X_test = np.load("data/y_test.npy") y_test
print(X_train.shape)
print(X_test.shape)
print(y_test.shape)
(307, 2)
(307, 2)
(307,)
0], X_train[:, 1], marker='x', c='b')
plt.scatter(X_train[:,
"The first dataset")
plt.title('Throughput (mb/s)')
plt.ylabel('Latency (ms)')
plt.xlabel(0, 25, 0, 25])
plt.axis([ plt.show()
Gaussian Distribution
\[ p(x ; \mu,\sigma ^2) = \frac{1}{\sqrt{2 \pi \sigma ^2}} \exp \left( - \frac{(x - \mu)^2}{2 \sigma ^2} \right) \]
\[ \text{(Univariate Gaussian Distribution)} \]
\[ p(\mathbf{x}; \boldsymbol{\mu}, \boldsymbol{\Sigma}) = \frac{1}{(2\pi)^{k/2} |\boldsymbol{\Sigma}|^{1/2}} \exp\left(-\frac{1}{2} (\mathbf{x} - \boldsymbol{\mu})^T \boldsymbol{\Sigma}^{-1} (\mathbf{x} - \boldsymbol{\mu})\right) \]
\[ \boldsymbol{\Sigma} = \begin{bmatrix} \sigma_{1}^2 & \sigma_{12} & \sigma_{13} & \ldots & \sigma_{1k} \\ \sigma_{21} & \sigma_{2}^2 & \sigma_{23} & \ldots & \sigma_{2k} \\ \sigma_{31} & \sigma_{32} & \sigma_{3}^2 & \ldots & \sigma_{3k} \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ \sigma_{k1} & \sigma_{k2} & \sigma_{k3} & \ldots & \sigma_{k}^2 \\ \end{bmatrix} \]
\[ \text{(Multivariate Gaussian Distribution)} \]
def estimate_gaussian(X):
= np.mean(X, axis=0)
mu = np.var(X, axis=0)
var
return mu, var
def univariate_gaussian(X, mu, var):
= (1 / (np.sqrt(2 * np.pi * var))) * np.exp(-((X - mu) ** 2) / (2 * var))
p
return p
def multivariate_gaussian(X, mu, var):
= len(mu)
k
if var.ndim == 1:
= np.diag(var)
var
= X - mu
X = (2 * np.pi)**(-k/2) * np.linalg.det(var)**(-0.5) * \
p -0.5 * np.sum(np.matmul(X, np.linalg.pinv(var)) * X, axis=1))
np.exp(
return p
= estimate_gaussian(X_train)
mu, var print("mu = ", mu)
print("var = ", var)
mu = [14.11222578 14.99771051]
var = [1.83263141 1.70974533]
print(univariate_gaussian(X_train[246][0], mu[0], var[0]))
0.28071685840987737
= np.meshgrid(np.arange(0, 30, 0.5), np.arange(0, 30, 0.5))
X1, X2 = multivariate_gaussian(np.stack([X1.ravel(), X2.ravel()], axis=1), mu, var)
Z = Z.reshape(X1.shape)
Z
0], X_train[:, 1], 'bx')
plt.plot(X_train[:, =10**(np.arange(-20., 1, 3)), linewidths=1)
plt.contour(X1, X2, Z, levels
"The Gaussian contours of the distribution fit to the dataset")
plt.title('Throughput (mb/s)')
plt.ylabel('Latency (ms)')
plt.xlabel( plt.show()
Selecting threshold \(\epsilon\)
\[\begin{aligned} prec&=\frac{tp}{tp+fp} \\ rec &=\frac{tp}{tp+fn},\end{aligned}\] \[F_1 = \frac{2\cdot prec \cdot rec}{prec + rec}\]
def select_threshold(y_val, p_val):
= 0
best_epsilon = 0
best_F1 = 0
F1
= (max(p_val) - min(p_val)) / 1000
step_size
for epsilon in np.arange(min(p_val), max(p_val), step_size):
= (p_val < epsilon)
predictions
= np.sum((predictions == 1) & (y_val == 1))
tp = np.sum((predictions == 1) & (y_val == 0))
fp = np.sum((predictions == 0) & (y_val == 1))
fn
= 0, 0
prec, rec
if (tp+fp) != 0:
= (tp)/(tp+fp)
prec if (tp+fn) != 0:
= (tp)/(tp+fn)
rec
= 0
F1 if (prec+rec) != 0:
= 2*prec*rec/(prec+rec)
F1
if F1 > best_F1:
= F1
best_F1 = epsilon
best_epsilon
return best_epsilon, best_F1
Finding Outliers
= multivariate_gaussian(X_train, mu, var)
p = multivariate_gaussian(X_test, mu, var)
p_val = select_threshold(y_test, p_val)
epsilon, F1
print('Best epsilon found using cross-validation: %e' % epsilon)
print('Best F1 on Cross Validation Set: %f' % F1)
Best epsilon found using cross-validation: 8.990853e-05
Best F1 on Cross Validation Set: 0.875000
= p < epsilon
outliers
= np.meshgrid(np.arange(0, 30, 0.5), np.arange(0, 30, 0.5))
X1, X2 = multivariate_gaussian(np.stack([X1.ravel(), X2.ravel()], axis=1), mu, var)
Z = Z.reshape(X1.shape)
Z
0], X_train[:, 1], 'bx')
plt.plot(X_train[:, =10**(np.arange(-20., 1, 3)), linewidths=1)
plt.contour(X1, X2, Z, levels0], X_train[outliers, 1], 'ro',
plt.plot(X_train[outliers, =10, markerfacecolor='none', markeredgewidth=2)
markersize
"The Gaussian contours of the distribution fit to the dataset")
plt.title('Throughput (mb/s)')
plt.ylabel('Latency (ms)')
plt.xlabel(
plt.show()
print('# Anomalies found: %d' % sum(p < epsilon))
# Anomalies found: 6