This commit is contained in:
yuyi@microsoft.com 2020-04-21 23:39:05 +08:00
Родитель 898c88bf84
Коммит adf8c51d53
7 изменённых файлов: 296 добавлений и 13 удалений

Просмотреть файл

@ -1,4 +1,4 @@
from msanomalydetector.spectral_residual import SpectralResidual
from msanomalydetector.util import MAX_RATIO, THRESHOLD, MAG_WINDOW, SCORE_WINDOW
from msanomalydetector.util import MAX_RATIO, THRESHOLD, MAG_WINDOW, SCORE_WINDOW, DetectMode
__all__ = ['SpectralResidual', 'MAX_RATIO', 'THRESHOLD', 'MAG_WINDOW', 'SCORE_WINDOW']
__all__ = ['SpectralResidual', 'MAX_RATIO', 'THRESHOLD', 'MAG_WINDOW', 'SCORE_WINDOW', 'DetectMode']

Просмотреть файл

@ -0,0 +1,65 @@
import numpy as np
cimport numpy as np
import array
import bisect
cpdef float sorted_median(float[:] data, int i, int j):
cdef int n = j - i
cdef int mid
if n == 0:
raise Exception("no median for empty data")
if n % 2 == 1:
return data[i + n // 2]
else:
mid = i + n // 2
return (data[mid - 1] + data[mid])/2
cpdef median_filter(np.ndarray data, int window, bint need_two_end=False):
cdef int w_len = window // 2 * 2 + 1
cdef int t_len = len(data)
cdef float[:] val = array.array('f', [x for x in data])
cdef float[:] ans = array.array('f', [x for x in data])
cdef float[:] cur_windows = array.array('f', [0 for x in range(w_len)])
cdef int delete_id
cdef int add_id
cdef int index
if t_len < w_len:
return ans
for i in range(0, w_len):
index = i
add_id = bisect.bisect_right(cur_windows[:i], val[i])
while index > add_id:
cur_windows[index] = cur_windows[index - 1]
index -= 1
cur_windows[add_id] = data[i]
if i >= w_len // 2 and need_two_end:
ans[i - w_len // 2] = sorted_median(cur_windows, 0, i + 1)
ans[window // 2] = sorted_median(cur_windows, 0, w_len)
for i in range(window // 2 + 1, t_len - window // 2):
delete_id = bisect.bisect_right(cur_windows, val[i - window // 2 - 1]) - 1
index = delete_id
while index < w_len - 1:
cur_windows[index] = cur_windows[index + 1]
index += 1
add_id = bisect.bisect_right(cur_windows[:w_len - 1], val[i + window // 2])
index = w_len - 1
while index > add_id:
cur_windows[index] = cur_windows[index - 1]
index -= 1
cur_windows[add_id] = data[i + window // 2]
ans[i] = sorted_median(cur_windows, 0, w_len)
if need_two_end:
for i in range(t_len - window // 2, t_len):
delete_id = bisect.bisect_right(cur_windows[: w_len], data[i - window // 2 - 1]) - 1
index = delete_id
while index < w_len - 1:
cur_windows[index] = cur_windows[index + 1]
index += 1
w_len -= 1
ans[i] = sorted_median(cur_windows[: w_len], 0, w_len)
return ans

Просмотреть файл

@ -0,0 +1,113 @@
import bisect
import numpy as np
from msanomalydetector._anomaly_kernel_cython import median_filter
# pseudo - code to generate the factors.
# factors = [1]
# for i in range(50):
# if i < 40:
# factors.append(factors[-1] / (1.15 + 0.001 * i))
# else:
# factors.append(factors[-1] / (1.25 + 0.01 * i))
# for i in range(50):
# factors.insert(0, factors[0] * (1.25 + 0.001 * i))
factors = [
184331.62871148242, 141902.71648305038, 109324.12672037778, 84289.9974713784, 65038.57829581667, 50222.84038287002,
38812.08684920403, 30017.081863266845, 23233.035497884553, 17996.15452973242, 13950.50738738947, 10822.736530170265,
8402.745753237783, 6528.939979205737, 5076.93622022219, 3950.92312857758, 3077.042935029268, 2398.318733460069,
1870.7634426365591, 1460.393007522685, 1140.9320371270976, 892.0500681212648, 698.0047481387048, 546.5972968979678,
428.36778753759233, 335.97473532360186, 263.71643275007995, 207.16137686573444, 162.8627176617409, 128.13746472206208,
100.8956415134347, 79.50799173635517, 62.70346351447568, 49.48971074544253, 39.09139869308257, 30.90229145698227,
24.448015393182175, 19.35709849024717, 15.338429865489042, 12.163703303322, 9.653732780414286, 7.667778221139226,
6.095213212352326, 4.8490160798347866, 3.8606815922251485, 3.076240312529999, 2.4531421949999994, 1.9578149999999996,
1.5637499999999998, 1.25, 1.0, 0.8695652173913044, 0.7554867223208555, 0.655804446459076, 0.5687809596349316,
0.4928777813127657, 0.4267340097946024, 0.36914706729636887, 0.3190553736355825, 0.27552277516026125, 0.23772456873189068,
0.20493497304473338, 0.17651591132190647, 0.1519069804835684, 0.13061649224726435, 0.11221348131208278, 0.09632058481723846,
0.08260770567516164, 0.0707863801843716, 0.06060477755511267, 0.051843265658779024, 0.0443104834690419, 0.03783986632710667,
0.03228657536442549, 0.027524787181948417, 0.02344530424356765, 0.019953450420057577, 0.01696721974494692, 0.014415649740821513,
0.012237393667929978, 0.010379468759906684, 0.008796159966022614, 0.0074480609365136455, 0.006301235986898177,
0.00532648857725966, 0.004498723460523362, 0.0037963911059268884, 0.0032010043051660104, 0.002696718032995797,
0.0022699646742388863, 0.0019091376570554135, 0.0011570531254881296, 0.000697019955113331, 0.00041737721863073713,
0.000248438820613534, 0.00014700521929794912, 8.647365841055832e-05, 5.056939088336744e-05, 2.9400808653120604e-05,
1.6994687082728674e-05, 9.767061541798089e-06
]
def calculate_boundary_unit_last(data):
if len(data) == 0:
return 0
calculation_size = len(data) - 1
window = int(min(calculation_size // 3, 512))
trends = np.abs(np.asarray(median_filter(data[:calculation_size], window, need_two_end=True), dtype=float))
unit = max(np.mean(trends), 1.0)
if not np.isfinite(unit):
raise Exception('Not finite unit value')
return unit
def calculate_bounary_unit_entire(data, is_anomaly):
if len(data) == 0:
return []
window = int(min(len(data)//3, 512))
trend_fraction = 0.5
trends = np.abs(np.asarray(median_filter(data, window, need_two_end=True), dtype=float))
valid_trend = [t for a, t in zip(is_anomaly, trends) if not a]
if len(valid_trend) > 0:
average_part = np.mean(valid_trend)
units = trend_fraction * trends + average_part * (1 - trend_fraction)
else:
units = trends
if not np.all(np.isfinite(units)):
raise Exception('Not finite unit values')
return units
def calculate_margin(unit, sensitivity):
def calculate_margin_core(unit, sensitivity):
lb = int(sensitivity)
if lb == sensitivity:
return unit * factors[lb]
return (factors[lb + 1] + (factors[lb] - factors[lb + 1]) * (1 - sensitivity + lb)) * unit
if 0 > sensitivity or sensitivity > 100:
raise Exception('sensitivity should be integer in [0, 100]')
if unit <= 0:
raise Exception('unit should be a positive number')
return calculate_margin_core(unit, sensitivity)
def calculate_anomaly_score(value, expected_value, unit, is_anomaly):
if not is_anomaly:
return 0
distance = np.abs(expected_value - value)
margins = [calculate_margin(unit, i) for i in range(101)][::-1]
lb = bisect.bisect_left(margins, distance)
if lb == 0:
score = 0
else:
a, b = margins[lb-1], margins[lb]
score = lb - 1 + (distance - a) / (b - a)
return score
def calculate_anomaly_scores(values, expected_values, units, is_anomaly):
scores = [calculate_anomaly_score(value, exp, unit, anomaly)
for value, exp, unit, anomaly in zip(values, expected_values, units, is_anomaly)]
return scores

Просмотреть файл

@ -24,36 +24,75 @@ ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"""
from msanomalydetector.util import AnomalyId, AnomalyScore, IsAnomaly, Value, Timestamp, EPS, average_filter
import pandas as pd
import numpy as np
from msanomalydetector.util import *
import msanomalydetector.boundary_utils as boundary_helper
from msanomalydetector._anomaly_kernel_cython import median_filter
class SpectralResidual:
def __init__(self, series, threshold, mag_window, score_window):
def __init__(self, series, threshold, mag_window, score_window, sensitivity, detect_mode):
self.__series__ = series
self.__values__ = self.__series__['value'].tolist()
self.__threshold__ = threshold
self.__mag_window = mag_window
self.__score_window = score_window
self.__sensitivity = sensitivity
self.__detect_mode = detect_mode
self.__anomaly_frame = None
def detect(self):
anomaly_scores = self.generate_spectral_score(series=self.__series__['value'].tolist())
if self.__anomaly_frame is None:
self.__anomaly_frame = self.__detect()
return self.__anomaly_frame
return anomaly_frame
def __detect(self):
extended_series = SpectralResidual.extend_series(self.__values__)
mags = self.spectral_residual_transform(extended_series)[:len(self.__series__)]
anomaly_scores = self.generate_spectral_score(mags)
anomaly_frame = pd.DataFrame({Timestamp: self.__series__['timestamp'],
Value: self.__series__['value'],
AnomalyId: list(range(0, len(anomaly_scores))),
Mag: mags,
AnomalyScore: anomaly_scores})
anomaly_frame[IsAnomaly] = np.where(anomaly_frame[AnomalyScore] >= self.__threshold__, True, False)
anomaly_frame.set_index(AnomalyId, inplace=True)
if self.__detect_mode == DetectMode.anomaly_and_margin:
anomaly_frame[ExpectedValue] = self.calculate_expected_value(anomaly_frame[anomaly_frame[IsAnomaly]].index.tolist())
boundary_units = boundary_helper.calculate_bounary_unit_entire(np.asarray(self.__values__),
anomaly_frame[IsAnomaly].values)
anomaly_frame[AnomalyScore] = boundary_helper.calculate_anomaly_scores(
values=anomaly_frame[Value].values,
expected_values=anomaly_frame[ExpectedValue].values,
units=boundary_units,
is_anomaly=anomaly_frame[IsAnomaly].values
)
margins = [boundary_helper.calculate_margin(u, self.__sensitivity) for u in boundary_units]
anomaly_frame[LowerBoundary] = anomaly_frame[ExpectedValue].values - margins
anomaly_frame[UpperBoundary] = anomaly_frame[ExpectedValue].values + margins
anomaly_frame[IsAnomaly] = np.logical_and(anomaly_frame[IsAnomaly].values,
anomaly_frame[LowerBoundary].values <= anomaly_frame[Value].values)
anomaly_frame[IsAnomaly] = np.logical_and(anomaly_frame[IsAnomaly].values,
anomaly_frame[Value].values <= anomaly_frame[UpperBoundary].values)
return anomaly_frame
def generate_spectral_score(self, series):
extended_series = SpectralResidual.extend_series(series)
mag = self.spectral_residual_transform(extended_series)[:len(series)]
ave_mag = average_filter(mag, n=self.__score_window)
def generate_spectral_score(self, mags):
ave_mag = average_filter(mags, n=self.__score_window)
ave_mag[np.where(ave_mag <= EPS)] = EPS
return abs(mag - ave_mag) / ave_mag
raw_scores = abs(mags - ave_mag) / ave_mag
scores = np.clip(raw_scores / 10.0, 0, 1.0)
return scores
def spectral_residual_transform(self, values):
"""
@ -124,3 +163,12 @@ class SpectralResidual:
extension = [SpectralResidual.predict_next(values[-look_ahead - 2:-1])] * extend_num
return values + extension
def calculate_expected_value(self, anomaly_index):
values = deanomaly_entire(self.__values__, anomaly_index)
length = len(values)
fft_coef = np.fft.fft(values)
fft_coef.real = [v if length * 3 / 8 >= i or i >= length * 5 / 8 else 0 for i, v in enumerate(fft_coef.real)]
fft_coef.imag = [v if length * 3 / 8 >= i or i >= length * 5 / 8 else 0 for i, v in enumerate(fft_coef.imag)]
exps = np.fft.ifft(fft_coef)
return exps

Просмотреть файл

@ -23,6 +23,7 @@ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"""
from enum import Enum
import numpy as np
IsAnomaly = "isAnomaly"
@ -30,14 +31,23 @@ AnomalyId = "id"
AnomalyScore = "score"
Value = "value"
Timestamp = "timestamp"
Mag = "mag"
ExpectedValue = "expectedValue"
UpperBoundary = "upperBoundary"
LowerBoundary = "lowerBoundary"
MAX_RATIO = 0.25
EPS = 1e-8
THRESHOLD = 3
THRESHOLD = 0.3
MAG_WINDOW = 3
SCORE_WINDOW = 100
class DetectMode(Enum):
anomaly_only = 'AnomalyOnly'
anomaly_and_margin = 'AnomalyAndMargin'
def average_filter(values, n=3):
"""
Calculate the sliding window average for the give time series.
@ -61,3 +71,35 @@ def average_filter(values, n=3):
res[i] /= (i + 1)
return res
def leastsq(x, y):
n = len(x)
sum_x = np.sum(x)
sum_y = np.sum(y)
sum_xx = np.sum(np.multiply(x, x))
sum_xy = np.sum(np.multiply(x, y))
a = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x)
b = (sum_xx * sum_y - sum_x * sum_xy) / (n * sum_xx - sum_x * sum_x)
return a, b
def deanomaly_entire(values, entire_anomalies):
min_points_to_fit = 4
for idx in entire_anomalies:
step = 1
start = max(idx - step, 0)
end = min(len(values) - 1, idx + step)
fit_values = [(i, values[i]) for i in range(start, end+1) if i not in entire_anomalies]
while len(fit_values) < min_points_to_fit and (start > 0 or end < len(values)-1):
step = step + 2
start = max(idx - step, 0)
end = min(len(values) - 1, idx + step)
fit_values = [(i, values[i]) for i in range(start, end+1) if i not in entire_anomalies]
if len(fit_values) > 1:
x, y = tuple(zip(*fit_values))
a, b = leastsq(x, y)
values[idx] = a * idx + b
return values

Просмотреть файл

@ -1,2 +1,3 @@
Cython>=0.29.2
numpy==1.18.1
pandas==0.25.3

Просмотреть файл

@ -1,10 +1,21 @@
from setuptools import setup, find_packages
from setuptools import setup, find_packages, Extension
from Cython.Build import cythonize
from Cython.Distutils import build_ext
import numpy as np
__version__ = "can't find version.py"
exec(compile(open('version.py').read(),
'version.py', 'exec'))
extensions = [
Extension("msanomalydetector._anomaly_kernel_cython", ["msanomalydetector/_anomaly_kernel_cython.pyx"],
define_macros=[('CYTHON_TRACE', '1')])
]
cmdclass = {'build_ext': build_ext}
install_requires = [
'Cython>=0.29.2',
'numpy==1.18.1',
'pandas==0.25.3'
]
@ -13,9 +24,12 @@ setup(
name="msanomalydetector",
description='Microsoft Anomaly Detector Package Based On Saliency Detection',
packages=find_packages(),
include_dirs=[np.get_include()],
cmdclass=cmdclass,
ext_modules=cythonize(extensions),
version=__version__,
install_requires=install_requires,
requires=['numpy', 'pandas'],
requires=['Cython', 'numpy', 'pandas'],
python_requires='>=3.6.0',
package_data={'': ['*.txt']}
)