Remove useless stuff for libmozdata

This commit is contained in:
calixte 2017-09-28 15:11:43 +02:00
Parent 86c8ab9d2d
Commit 27ae1eb889
17 changed files with 1 addition and 1468 deletions

View file

@@ -2,16 +2,9 @@ language: python
python:
- "2.7"
- "3.5"
before_install:
- if [[ ${TRAVIS_PYTHON_VERSION%%.*} == '3' ]]; then
sudo apt-get install -qq python3-numpy python3-scipy ;
else
sudo apt-get install -qq python-numpy python-scipy ;
fi
install:
- pip install --upgrade pip
- pip install -r requirements.txt
- pip install -r requirements-spikes.txt
- pip install -r test-requirements.txt
script:
- flake8 .

View file

@@ -1,6 +1,5 @@
include VERSION
include requirements.txt
include requirements-spikes.txt
include libmozdata/*.json
recursive-exclude * __pycache__

View file

@@ -1 +1 @@
0.1.36
0.1.37

View file

@@ -1,30 +0,0 @@
import os
from .connection import Query
from .socorro import SuperSearch
class CrashInfo(SuperSearch):
def __init__(self, paths):
self.info = {}
paths = [paths] if type(paths) == str else paths
queries = []
for path in paths:
queries.append(Query(SuperSearch.URL,
params={'product': 'Firefox',
'topmost_filenames': '~' + os.path.basename(path).lower(),
'_results_number': 0,
'_facets': 'product',
'_facets_size': 1},
handler=self.__handler,
handlerdata=path))
super(CrashInfo, self).__init__(queries=queries)
def get(self):
self.wait()
return self.info
def __handler(self, res, path):
self.info[path] = res['total']

View file

@@ -1,149 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import utils
from connection import Query
import socorro
def get_files(uuid, common=None, remove_dup=True):
"""Get the files which appears in a backtrace
Args:
uuid (str): crash uuid
common (list[str]): common part of the different backtraces
remove_dup (bool): if True, remove the duplicate files
Returns:
List[str]: a list of the files which appear in the backtrace
"""
def handler(json, data):
frames = json['json_dump']['threads'][json['crashedThread']]['frames']
n = len(common) if common else -1
if remove_dup:
_files = set()
for frame in frames:
if 'file' in frame:
f = frame['file']
if not remove_dup or f not in _files:
data.append(f)
if remove_dup:
_files.add(f)
if n != -1 and 'function' in frame:
if n <= 1:
break
else:
n -= 1
files = []
socorro.ProcessedCrash(params={'crash_id': uuid}, handler=handler, handlerdata=files).wait()
return files
def get_infos(uuids, fraction=0.3):
"""Get info from different backtraces
Args:
uuids (List[str]): crash uuids
fraction (float): the fraction of all the uuids to look in
Returns:
dict: info about the different backtraces
"""
def handler(json, data):
jd = json['json_dump']
if 'threads' in jd and 'crashedThread' in json:
thread_nb = json['crashedThread']
if thread_nb is not None:
frames = jd['threads'][thread_nb]['frames']
data['cpu_name'] = json['cpu_name']
data['os'] = json['os_pretty_version']
functions = []
for frame in frames:
if 'function' in frame:
functions.append(frame['function'])
bt = tuple(functions)
data['cycles'] = __cycles_detection(functions)
data['functions'] = bt
if 'crash_info' in jd:
data['address'] = jd['crash_info']['address']
base = {'cycles': [],
'functions': None,
'address': '',
'cpu_name': '',
'os': ''}
info = {}
queries = []
for uuid in utils.get_sample(uuids, fraction):
data = base.copy()
info[uuid] = data
queries.append(Query(socorro.ProcessedCrash.URL, params={'crash_id': uuid}, handler=handler, handlerdata=data))
socorro.ProcessedCrash(queries=queries).wait()
return info
def __cycles_detection(funs):
"""Detect if there are some cycle in the backtrace [a,b,c,d,b,c,d,b,c,d...]
Args:
funs (List[str]): functions list
Returns:
list: the different cycles present in the backtrace
"""
# TODO: improve this algorithm (not sure that's a good one)
positions = {}
# we get the function positions in the trace
for i in range(len(funs)):
fun = funs[i]
if fun in positions:
positions[fun].append(i)
else:
positions[fun] = [i]
lengths = {}
for k, v in positions.items():
if len(v) >= 2:
l = v[1] - v[0]
good = True
for i in range(2, len(v)):
if v[i] - v[i - 1] != l:
good = False
break
if good:
if l in lengths:
lengths[l].append((k, v))
else:
lengths[l] = [(k, v)]
cycles = []
for k, v in lengths.items():
l = sorted(v, key=lambda x: x[1][0])
pat = []
container = [l[0][0]]
pos = l[0][1][0]
for i in range(1, len(l)):
_pos = l[i][1][0]
if _pos == pos + 1:
container.append(l[i][0])
pos = _pos
else:
pat.append(tuple(container))
container = [l[i][0]]
pos = _pos
pat.append(tuple(container))
cycles += pat
cycles = tuple(cycles)
return cycles
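# A minimal standalone sketch of the cycle-detection idea documented above: look
# for the shortest pattern that repeats back-to-back at the end of the trace.
# This is only an illustration (the helper name and the min_repeats parameter are
# made up), not the algorithm used by __cycles_detection.
def find_cycle(functions, min_repeats=3):
    n = len(functions)
    for period in range(1, n // min_repeats + 1):
        tail = functions[n - min_repeats * period:]
        # the tail repeats with this period if every frame matches the frame at
        # its position modulo the period
        if all(tail[i] == tail[i % period] for i in range(len(tail))):
            return tuple(tail[:period])
    return None
# find_cycle(['main', 'a', 'b', 'c', 'd', 'b', 'c', 'd', 'b', 'c', 'd'])
# returns ('b', 'c', 'd')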

View file

@@ -1,457 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
import os
from bisect import bisect
from dateutil.relativedelta import relativedelta
from datetime import datetime
from . import socorro
from . import utils
from .connection import Query
from . import patchanalysis
from . import config
import matplotlib
matplotlib.use('Agg') # NOQA
import matplotlib.pyplot as plt
def mean(x):
"""Get the mean of the sequence x
Args:
x (list or numpy.array): numbers
Returns:
(float, float): the mean and the standard deviation
"""
l = len(x)
m = np.sum(x) / l
e = np.sqrt(np.sum((x - m) ** 2) / l)
return m, e
def median(x):
"""Get the median of the sequence x
Args:
x (list or numpy.array): numbers
Returns:
(float, float): the median and the interquartile-range
"""
q1, m, q3 = np.percentile(x, [25, 50, 100], interpolation='midpoint')
return m, q3 - q1
def __convert(x):
"""Convert a sequence into a numpy.array
Args:
x (list): numbers
Returns:
(numpy.array): a float64 array
"""
if not isinstance(x, np.ndarray):
return np.asarray(x, dtype=np.float64)
return x
def moving(x, f=mean, coeff=2.0):
"""Get the constant trends of x.
The idea is the following:
- suppose that [x_0, ..., x_{n-1}] is a 'constant' piece (constant in trend)
- we have a new value x_n
- we compute f([x_0, ..., x_n]) and get the position (p) and dispersion (d) parameters
- if abs(x_n - p) <= coeff * d then x_n is added to the constant piece
- otherwise a new piece starts with x_n.
Args:
x (list): numbers
f (func): the function to compute the position
coeff (float): a coefficient for the tolerance relative to the dispersion
Returns:
(numpy.array): the smoothed data
"""
x = __convert(x)
pieces = [[0, 0]]
coeff = float(coeff)
l = len(x)
for i in range(1, l):
p, d = f(x[pieces[-1][0]:(i + 1)])
if abs(x[i] - p) <= coeff * d:
pieces[-1][1] = i
else:
pieces.append([i, i])
yp = np.empty(l)
yd = np.empty(l)
pos = 0
for piece in pieces:
p, d = f(x[piece[0]:(piece[1] + 1)])
N = piece[1] - piece[0] + 1
yp[pos:(pos + N)] = p
yd[pos:(pos + N)] = d
pos += N
return yp, yd
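# A minimal standalone sketch of the piecewise-constant idea described in the
# docstring above (illustration only; the helper name is made up and the mean /
# population standard deviation are used as position / dispersion): a point joins
# the current piece while it stays within coeff * dispersion, otherwise a new
# piece starts.
def _piecewise_constant_sketch(x, coeff=2.0):
    x = np.asarray(x, dtype=np.float64)
    pieces = [[0, 0]]
    for i in range(1, len(x)):
        chunk = x[pieces[-1][0]:i + 1]
        if abs(x[i] - chunk.mean()) <= coeff * chunk.std():
            pieces[-1][1] = i
        else:
            pieces.append([i, i])
    return pieces
# _piecewise_constant_sketch([10, 11, 9, 10, 10, 11, 50, 52, 51, 49])
# returns [[0, 5], [6, 9]]: the level shift at index 6 starts a new piece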
def multimoving(x, f=mean, coeff=2.0):
"""Compute all the moving curves in moving the first point from left to right
and for each point, select the position which minimize the dispersion.
Args:
x (list): numbers
f (func): the fonction to compute the position
coeff (float): a coefficient for the tolerance relative to the dispersion
Returns:
(numpy.array): the smoothed data
"""
x = __convert(x)
l = len(x)
ys = np.empty((l, l))
ds = np.empty((l, l))
for i in range(l):
x1 = x[:(i + 1)]
x2 = x[i:]
y1, d1 = moving(x1[::-1], f, coeff)
y2, d2 = moving(x2, f, coeff)
ys[i][:len(y1)] = y1[::-1]
ys[i][len(y1):] = y2[1:]
ds[i][:len(d1)] = d1[::-1]
ds[i][len(d1):] = d2[1:]
y = np.empty(l)
d = np.empty(l)
mins_index = np.argmin(ds, axis=0)
for i in range(l):
y[i] = ys[mins_index[i]][i]
d[i] = ds[mins_index[i]][i]
return y, d
def plot(data, f=mean, coeff=2., multi=True, filename=''):
tr = trends(data, f=f, coeff=coeff, multi=multi)
x = tr['data']
sx = tr['smooth_data']
pieces = tr['pieces']
r = np.arange(len(x))
x1 = [pieces[0][0]]
y1 = [x[pieces[0][0]]]
for piece in pieces:
x1.append(piece[1])
y1.append(sx[piece[1]])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(r, x, color='blue')
ax.plot(r, sx, color='red')
ax.plot(x1, y1, color='green')
if filename:
fig.savefig(filename)
plt.close()
else:
plt.show()
def trends(data, f=mean, coeff=2., multi=True):
"""Get the global variations of the data
Args:
data (list or dict): the data
f (func): the function to compute the position
coeff (float): a coefficient for the tolerance relative to the dispersion
multi (Bool): if True, then apply the multimoving smoothing
Returns:
(list or dict): the global variations of the data
"""
if isinstance(data, dict):
# we sort the data according to the key
sorted_data = sorted(data.items(), key=lambda p: p[0])
x = [y for _, y in sorted_data]
else:
sorted_data = []
x = data
x = __convert(x)
if multi:
m, _ = multimoving(x, f, coeff)
else:
m, _ = moving(x, f, coeff)
diff = np.diff(m)
pieces = [[0, 0, 0]]
for i in range(len(diff)):
dx = diff[i]
last = pieces[-1]
k = i + 1
if dx > 0:
if last[2] >= 0:
last[1:] = k, +1
else:
pieces.append([i, k, +1])
elif dx < 0:
if last[2] <= 0:
last[1:] = k, -1
else:
pieces.append([i, k, -1])
else:
last[1] = k
incr = []
decr = []
inf = float('inf')
for piece in pieces:
p0 = piece[0]
p1 = piece[1]
percent = inf if m[p0] == 0 else round((m[p1] - m[p0]) / m[p0] * 100.)
info = [p0, p1, percent]
if piece[2] == 1:
incr.append(info)
elif piece[2] == -1:
info[2] = abs(info[2])
decr.append(info)
elif p0 == 0:
info[2] = abs(info[2])
decr.append(info)
else:
info[2] = info[2]
incr.append(info)
if sorted_data:
for i in incr:
i[0] = sorted_data[i[0]][0]
i[1] = sorted_data[i[1]][0]
for d in decr:
d[0] = sorted_data[d[0]][0]
d[1] = sorted_data[d[1]][0]
return {'increases': incr, 'decreases': decr, 'pieces': pieces, 'data': x, 'smooth_data': m}
def has_crash_stopped(data, date, threshold=51, f=mean, coeff=2., multi=True):
"""Check if a crash has stopped after a date
Args:
data (list or dict): the data
date (datetime.datetime): the date
threshold (float): the percentage of decrease
f (func): the function to compute the position
coeff (float): a coefficient for the tolerance relative to the dispersion
multi (Bool): if True, then apply the multimoving smoothing
Returns:
(Bool): True if the crash has stopped
"""
if len(data) <= 3:
dates = sorted(data.keys())
i = bisect(dates, date)
if i == len(dates):
return 'untimely'
if i == 0:
return 'no'
n1 = data[dates[i - 1]]
n2 = data[dates[i]]
if n1 == 0:
return 'yes' if n2 == 0 else 'no'
percentage = round((n2 - n1) / n1 * 100.)
if percentage <= -threshold:
return 'yes'
else:
tr = trends(data, f=f, coeff=coeff, multi=multi)
one_day = relativedelta(days=1)
for dec in tr['decreases']:
if dec[0] - one_day <= date <= dec[1] + one_day and dec[2] >= threshold:
return 'yes'
return 'no'
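# A quick worked example of the short-series branch above, with the default
# threshold of 51 and Python 3 division: for data = {d1: 100, d2: 40} and a date
# falling between d1 and d2, i = 1, n1 = 100, n2 = 40, so
# percentage = round((40 - 100) / 100 * 100.) = -60, and -60 <= -51,
# hence has_crash_stopped returns 'yes'.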
def have_crashes_stopped(crashes_info, all_versions, product='Firefox', thresholds={}, path=None):
def get_date(build_id, version, channel):
date = utils.get_date_from_buildid(build_id)
if channel in ['nightly', 'aurora']:
return utils.as_utc(datetime(date.year, date.month, date.day))
else:
major = socorro.ProductVersions.get_major_version(version)
if version in all_versions[channel][major]['versions']:
return all_versions[channel][major]['versions'][version] + relativedelta(days=1)
else:
return None
def handler(json, data):
trend = data['trend']
channel = data['channel']
for info in json['facets']['build_id']:
date = get_date(info['term'], info['facets']['version'][0]['term'], channel)
if date:
trend[date] += info['facets']['cardinality_install_time']['value']
if thresholds:
ts = thresholds.get(data['channel'], -1)
for k, v in trend.items():
if v <= ts:
trend[k] = 0
if len(trend) > 1:
data['stop'] = has_crash_stopped(trend, data['push_date'])
else:
data['stop'] = 'untimely'
if path and data['stop'] == 'no':
signature = data['signature']
bugid = data['bugid']
channel = data['channel']
filename = os.path.join(path, '%s_%s_%s.png' % (signature, bugid, channel))
plot(trend, filename=filename)
def trends_handler(json, data):
channel = data[1]
data = data[0]
for info in json['facets']['build_id']:
date = get_date(info['term'], info['facets']['version'][0]['term'], channel)
if date:
data[date] = 0
queries = []
for info in crashes_info:
d = {}
info['trend'] = d
search_date = socorro.SuperSearch.get_search_date(info['start_date'], info['end_date'])
queries.append(Query(socorro.SuperSearch.URL,
{'product': product,
'version': info.get('versions', None),
'release_channel': info.get('channel', None),
'build_id': info.get('build_id'),
'date': search_date,
'_aggs.build_id': ['_cardinality.install_time', 'version'],
'_facets_size': 100,
'_results_number': 0},
handler=trends_handler, handlerdata=(d, info['channel'])))
socorro.SuperSearch(queries=queries).wait()
queries = []
for info in crashes_info:
search_date = socorro.SuperSearch.get_search_date(info['start_date'], info['end_date'])
queries.append(Query(socorro.SuperSearch.URL,
{'signature': '=' + info['signature'],
'product': product,
'version': info.get('versions', None),
'release_channel': info.get('channel', None),
'build_id': info.get('build_id'),
'date': search_date,
'_aggs.build_id': ['_cardinality.install_time', 'version'],
'_facets_size': 1000,
'_results_number': 0},
handler=handler, handlerdata=info))
socorro.SuperSearch(queries=queries).wait()
def analyze_bugs(bugs, base_versions=None, min_date=None, thresholds=None, minimal_releases=None, minimal_days=None):
if thresholds is None:
chans = utils.get_channels()
thresholds = {chan: int(config.get('Thresholds', chan, -1)) for chan in chans}
if minimal_releases is None:
chans = utils.get_channels()
minimal_releases = {chan: int(config.get('Minimal-Releases', chan, -1)) for chan in chans}
if minimal_days is None:
chans = utils.get_channels()
minimal_days = {chan: int(config.get('Minimal-Days', chan, -1)) for chan in chans}
patch_info = patchanalysis.get_patch_info(bugs, base_versions=base_versions)
all_versions = socorro.ProductVersions.get_all_versions()
tomorrow = utils.get_date_ymd('tomorrow')
today = utils.get_date_ymd('today')
# prepare the data
data = []
untimely = []
for bugid, info in patch_info.items():
for sgn in info['signatures']:
for chan, push_date in info['land'].items():
if not min_date or push_date >= min_date:
d = {'signature': sgn, 'push_date': push_date, 'channel': chan, 'versions': None, 'bugid': bugid}
add_data = False
is_untimely = False
chan_versions = all_versions[chan]
for v in chan_versions.values():
dates = v['dates']
end_date = dates[1] if dates[1] else tomorrow
last_release = None
if dates[0] <= push_date <= end_date:
# we check that we have the correct number of new versions after push_date
minimal_releases_for_chan = minimal_releases[chan]
if minimal_releases_for_chan != -1:
version_dates = sorted(v['versions'].values())
# i is such that v_d[i - 1] <= push_date < v_d[i]
i = bisect(version_dates, push_date)
remainder = len(version_dates) - i
if remainder < minimal_releases_for_chan:
is_untimely = True
break
last_release = version_dates[i + minimal_releases_for_chan - 1]
minimal_days_for_chan = minimal_days[chan]
if minimal_days_for_chan != -1:
if last_release:
if (today - last_release).days >= minimal_days_for_chan:
add_data = True
else:
is_untimely = True
break
elif (today - push_date).days >= minimal_days_for_chan:
add_data = True
else:
is_untimely = True
else:
add_data = True
break
if add_data:
d['start_date'] = dates[0]
d['end_date'] = end_date
if dates[1]:
d['build_id'] = ['>=' + utils.get_buildid_from_date(dates[0]),
'<' + utils.get_buildid_from_date(end_date + relativedelta(days=1))]
else:
d['build_id'] = '>=' + utils.get_buildid_from_date(dates[0])
d['versions'] = v['all']
data.append(d)
elif is_untimely:
d['stop'] = 'untimely'
untimely.append(d)
if data:
have_crashes_stopped(data, all_versions, thresholds=thresholds, path='/tmp')
for d in data + untimely:
pi = patch_info[d['bugid']]
sgn = d['signature']
if 'stops' in pi:
stops = pi['stops']
else:
stops = {}
pi['stops'] = stops
if sgn in stops:
stops[sgn][d['channel']] = d['stop']
else:
stops[sgn] = {d['channel']: d['stop']}
for bugid in bugs:
bugid = str(bugid)
if bugid not in patch_info:
patch_info[bugid] = {}
return patch_info

View file

@@ -1,47 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import six
from . import utils
def isweird(addr, cpu_name):
"""Check if a memory address is weird
Args:
addr (str): a memory address
cpu_name (str): cpu name
Returns:
bool: True if the address is weird
"""
if not isinstance(addr, six.string_types):
raise Exception('The memory address must be a string.')
if addr == '0x0':
return True
addr = addr.lower()
# Strip leading zeroes
addr = addr[2:].lstrip('0')
if utils.is64(cpu_name):
if len(addr) <= 8:
val = int(addr, 16)
return val <= 0x10000 # first 64k
elif addr.startswith('ffffffff'):
addr = addr[8:] # 8 == len('ffffffff')
val = int(addr, 16)
return val >= 0xffff0000 # last 64k
else:
val = int(addr, 16)
return val <= 0x10000 or val >= 0xffff0000
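# Worked example of the 64-bit branch above: for addr = '0xffffffffffffaaaa',
# stripping '0x' and leading zeroes leaves 'ffffffffffffaaaa' (16 digits, so the
# "first 64k" check is skipped); it starts with 'ffffffff', so only the low
# 32 bits 'ffffaaaa' are kept, and 0xffffaaaa >= 0xffff0000 places the address
# in the last 64k, hence isweird returns True (cf. the memory tests removed
# later in this commit).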
def analyze(addrs, cpu_name=None):
"""
"""
# we analyze the end of each address to find if a pattern exists

View file

@@ -1,81 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import re
from connection import (Connection, Query)
from pprint import pprint
from . import config
class Phonebook(Connection):
"""Mozilla phonebook class
"""
URL = config.get('Phonebook', 'URL', 'https://phonebook.mozilla.org')
SEARCH_URL = URL + '/search.php'
def __init__(self, query='*'):
"""Constructor
Args:
query (Optional[str]): the query to pass to phonebook
"""
# TODO: fix credential problem
super(Phonebook, self).__init__(Phonebook.URL)
self.entries = {}
self.exec_queries(Query(Phonebook.SEARCH_URL, params={'query': query, 'format': 'fligtar'}, handler=self.default_handler, handlerdata=self.entries))
def get(self):
"""Get the phonebook entries (waits for all data)
Returns:
dict: the entries
"""
self.wait()
return self.entries
def get_auth(self):
if self.credentials:
ldap = self.credentials['ldap']
return (ldap['username'], ldap['password'])
return None
def default_handler(self, json, data):
"""Handler to use with the data retrieved from phonebook
Args:
json (dict): json data retrieved from phonebook
data (dict): the container which will receive the data
"""
mail_pattern = re.compile('mail=([^,]*)')
for k, v in json.items():
_manager = None
if 'manager' in v:
manager = v['manager']
if manager and 'dn' in manager:
dn = manager['dn']
m = mail_pattern.match(dn)
if m:
_manager = m.group(1)
if not _manager:
_manager = None
else:
_manager = None
bz_email = v['bugzillaEmail']
if not bz_email:
bz_email = k
data[k] = {'name': v['name'],
'bz_email': bz_email,
'manager': _manager}
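# A small self-contained illustration of the dn parsing done in default_handler
# above (the dn value is made up; only the leading 'mail=...' component matters):
#
# re.match('mail=([^,]*)', 'mail=jane@example.com,o=org,dc=mozilla').group(1)
# returns 'jane@example.com', which is stored as the entry's 'manager'.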
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Mozilla\'s phonebook')
parser.add_argument('-q', '--query', action='store', default='*', help='query to pass to phonebook, by default \'*\'')
args = parser.parse_args()
pprint(Phonebook(args.query).get())

View file

@@ -1,378 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib import ticker
from . import utils
try:
import scipy.stats as stats
SCIPY_ENABLED = True
except ImportError:
SCIPY_ENABLED = False
def ma(x, win):
"""Compute the moving average of x with a window equal to win
Args:
x (numpy.array): data
win (int): window
Returns:
numpy.array: the smoothed data
"""
y = np.ones(win, dtype=np.float64)
i = win - 1
_x = np.convolve(x, y, mode='full')[:-i]
_x[1:i] = _x[1:i] / np.arange(2., win, dtype=np.float64)
_x[i:] = _x[i:] / float(win)
return _x
def normalize(x):
"""Normalize data to have them in interval [0; 1]
Args:
x (numpy.array): data
Returns:
numpy.array: the normalized data
"""
m = np.nanmin(x)
M = np.nanmax(x)
if m != M:
return (x - m) / (M - m)
else:
return x / m
def __get_grubb_lambda(n, alpha):
"""Get the value to use for the Grubb's test
http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h1.htm
Args:
n (int): the number of elements in the sample
alpha (float): the signifiance level
Returns:
float: the critical value to use
"""
if not SCIPY_ENABLED:
raise NotImplementedError('Missing Scipy')
n = float(n)
p = alpha / (2. * n)
t = np.abs(stats.t.ppf(p, n - 2.))
l = (n - 1.) * t / np.sqrt((n - 2. + t ** 2) * n)
return l
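# A quick standalone numeric check of the formula above (illustration only; the
# helper name is made up).  For n = 54 and alpha = 0.05 it gives roughly 3.16,
# which matches the first critical value in the NIST generalized ESD worked
# example linked from the docstrings in this module.
def _grubb_lambda_check(n=54., alpha=0.05):
    import numpy as np
    import scipy.stats as stats
    p = alpha / (2. * n)
    t = np.abs(stats.t.ppf(p, n - 2.))
    return (n - 1.) * t / np.sqrt((n - 2. + t ** 2) * n)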
def __get_pd_median(data, c=1.):
"""Get the median and the mad of data
Args:
data (numpy.ndarray): the data
Returns:
float, float: the median and the mad
"""
p = np.nanmedian(data)
d = np.nanmedian(np.abs(data - p)) / c # d is the MAD
return p, d
def __get_pd_mean(data, c=1.):
"""Get the mean and the standard deviation of data
Args:
data (numpy.ndarray): the data
Returns:
float, float: the mean and the standard deviation
"""
p = np.nanmean(data)
d = np.nanstd(data) / c
return p, d
def __get_lambda_critical(N, i, alpha):
"""Get lambda for generalized ESD test (http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm).
Args:
N (int): the number of data in sequence
i (int): the i-th outlier
alpha (float): the significance level
Returns:
float: the critical value
"""
if not SCIPY_ENABLED:
raise NotImplementedError('Missing Scipy')
p = 1. - alpha / (2. * (N - i + 1))
t = stats.t.ppf(p, N - i - 1)
return (N - i) * t / np.sqrt((N - i - 1 + t ** 2) * (N - i + 1))
def generalized_esd(x, r, alpha=0.05, method='mean'):
"""Generalized ESD test for outliers (http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm).
Args:
x (numpy.ndarray): the data
r (int): max number of outliers
alpha (float): the significance level
method (str): 'median' or 'mean'
Returns:
list[int]: the indices of the outliers
"""
x = np.asarray(x, dtype=np.float64)
fn = __get_pd_median if method == 'median' else __get_pd_mean
NaN = float('nan')
outliers = []
N = len(x)
for i in range(1, r + 1):
if np.any(~np.isnan(x)):
m, e = fn(x)
if e != 0.:
y = np.abs(x - m)
j = np.nanargmax(y)
R = y[j]
l = __get_lambda_critical(N, i, alpha)
if R > l * e:
outliers.append(j)
x[j] = NaN
else:
break
else:
break
else:
break
return outliers
def get_spikes(data, alpha=0.05, win=-1, threshold_up=-float('Inf'), threshold_down=+float('Inf'), method='median', plot=False):
"""Get the spikes in data.
Grubbs' test is applied to determine whether a value is an outlier (http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h1.htm)
Args:
data (numpy.ndarray): the data
alpha (float): the significance level
win (int): the size of the window to use to compute the parameters
threshold_up (float): the minimum value for an upward spike to count
threshold_down (float): the maximum value for a downward spike to count
method (str): 'median' or 'mean'
plot (Bool): True if a plot is wanted
Returns:
list[int], list[int]: the indices of the spikes (up and down)
"""
# TODO:
# It could be interesting to remove the noise using wavelets.
# And maybe we could use them to detect the outliers too.
if isinstance(data, dict):
data = [i[1] for i in sorted(data.items(), key=lambda p: p[0])]
if isinstance(data, list):
data = np.asarray(data, dtype=np.float64)
fn = __get_pd_median if method == 'median' else __get_pd_mean
_data = np.copy(data)
NaN = float('nan')
spikes_up = []
spikes_down = []
for i in range(3, len(data) + 1):
start = max(0, i - win) if win > 0 else 0
d = _data[start:i]
x = d[-1]
N = d.size - np.isnan(d).sum() # number of non NaN values
m, e = fn(d)
l = __get_grubb_lambda(N, alpha)
th = l * e
if abs(x - m) > th: # Grubb's test
_data[i - 1] = NaN # to ignore this outlier in the future
if x > m + th and x >= threshold_up and x > data[i - 2]:
spikes_up.append(i - 1)
elif x < m - th and x <= threshold_down and x < data[i - 2]:
spikes_down.append(i - 1)
if plot:
fig = plt.figure()
ax = fig.add_subplot(111)
X = np.arange(len(data))
Y = data
ax.plot(X, Y, color='blue')
if spikes_up:
ax.plot(X[spikes_up], Y[spikes_up], 'ro', color='red')
if spikes_down:
ax.plot(X[spikes_down], Y[spikes_down], 'ro', color='green')
plt.show()
return spikes_up, spikes_down
def is_spiking(data, alpha=0.05, win=-1, threshold_up=-float('Inf'), threshold_down=+float('Inf'), method='median', plot=False):
"""Check if the last value is a spike (up).
Args:
data (numpy.ndarray): the data
alpha (float): the significance level
win (int): the size of the window to use to compute the parameters
threshold_up (float): the minimum value for an upward spike to count
threshold_down (float): the maximum value for a downward spike to count
method (str): 'median' or 'mean'
plot (Bool): True if a plot is wanted
Returns:
Bool: True if the last value is a spike (up).
"""
up, _ = get_spikes(data, alpha=alpha, win=win, threshold_up=threshold_up, threshold_down=threshold_down, method=method, plot=plot)
return up and up[-1] == len(data) - 1
def is_spiking_ma(data, alpha=2.5, win=7, method='mean', plot=False):
"""Check if the last value is spiking. The trend is removed from the data in using moving average.
Args:
data (numpy.ndarray): the data
alpha (float): the signifiance level
win (int): the size of the window to use to compute the parameters
method (str): 'median' or 'mean'
plot (Bool): True if a plot is wanted
Returns:
str: 'up', 'down' or 'none'.
"""
data = np.asarray(data, dtype=np.float64)
# maybe MAD should be divided by stats.norm.ppf(0.75)
fn = __get_pd_median if method == 'median' else __get_pd_mean
NaN = float('nan')
up = []
down = []
trend = ma(data, win)
noise = data - trend
noise = ma(noise, win)
_noise = np.copy(noise) if plot else None
for i in range(win, len(noise) + 1):
if up and up[-1] == i - 2 and noise[i - 2] < noise[i - 1]:
up.append(i - 1)
noise[i - 1] = NaN
continue
elif down and down[-1] == i - 2 and noise[i - 2] > noise[i - 1]:
down.append(i - 1)
noise[i - 1] = NaN
continue
x = noise[:i]
ax = np.abs(x)
m, e = fn(ax)
if np.abs(ax[-1] - m) > alpha * e:
if x[-1] > 0:
up.append(i - 1)
elif x[-1] < 0:
down.append(i - 1)
if x[-1] != 0:
noise[i - 1] = NaN
for j in range(i - 2, -1, -1):
if (x[-1] > 0 and x[j] < x[j + 1]) or (x[-1] < 0 and x[j] > x[j + 1]):
noise[j] = NaN
else:
break
elif (up and up[-1] == i - 2) or (down and down[-1] == i - 2):
noise[i - 1] = NaN
if plot:
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
x = np.arange(len(data))
ax.plot(x, data, color='red')
ax.plot(x, trend, color='blue')
ax.plot(x[up], data[up], 'ro', color='green')
ax.plot(x[down], data[down], 'ro', color='yellow')
ax = fig.add_subplot(2, 1, 2)
ax.plot(x, _noise, color='red')
ax.plot(x, noise, color='blue')
ax.plot(x[up], _noise[up], 'ro', color='green')
ax.plot(x[down], _noise[down], 'ro', color='yellow')
plt.show()
if up and up[-1] == len(data) - 1 and data[-1] > data[-2] and np.max(data[-win:]) == data[-1]:
return 'up'
elif down and down[-1] == len(data) - 1 and data[-1] < data[-2] and np.min(data[-win:]) == data[-1]:
return 'down'
else:
return 'none'
def get_spikes_ma(data, alpha=2.5, win=7, method='mean', plot=False):
"""Get the spikes in data. This function is mainly for debug purpose.
Args:
data (numpy.ndarray): the data
alpha (float): the signifiance level
win (int): the size of the window to use to compute the parameters
method (str): 'median' or 'mean'
plot (Bool): True if a plot is wanted
Returns:
list[int], list[int]: the indices of the spikes (up and down)
"""
original = data
if isinstance(data, dict):
data = np.asarray([float(i[1]) for i in sorted(data.items(), key=lambda p: p[0])], dtype=np.float64)
if isinstance(data, list):
data = np.asarray(data, dtype=np.float64)
up = []
down = []
for i in range(win, len(data) + 1):
s = is_spiking_ma(data[:i], alpha=alpha, win=win, method=method)
if s == 'up':
up.append(i - 1)
elif s == 'down':
down.append(i - 1)
if plot:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
if isinstance(original, dict):
labels = [i for i in sorted(original.keys())]
if isinstance(labels[0], datetime):
# take only the Mondays
xlabels = []
_labels = []
for i in range(len(labels)):
d = labels[i]
if d.isocalendar()[2] == 1: # we have a Monday
xlabels.append(i)
_labels.append(utils.get_date_str(d))
labels = _labels
ax.xaxis.set_major_locator(ticker.FixedLocator(xlabels))
# ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))
ax.xaxis.set_ticklabels(labels, rotation='vertical', fontsize=10)
x = np.arange(len(original))
ax.plot(x, data, color='red')
ax.plot(x[up], data[up], 'ro', color='green')
ax.plot(x[down], data[down], 'ro', color='yellow')
plt.show()
return up, down

View file

@@ -31,18 +31,3 @@ URL = https://phonebook.mozilla.org
[Gmail]
credentials =
[Thresholds]
nightly = 5
aurora = 5
beta = 10
release = 50
[Minimal-Releases]
beta = 2
release = 1
[Minimal-Days]
release = 7
nightly = 3
aurora = 3

View file

@@ -1 +0,0 @@
scipy>=0.18.0

View file

@@ -5,8 +5,6 @@ whatthepatch>=0.0.4
elasticsearch>=2.3.0
python-dateutil>=2.5.2
icalendar>=3.10
numpy>=1.11
matplotlib>=1.5.2
google-api-python-client>=1.5.3
oauth2client>=3.0.0
httplib2>=0.9.2

View file

@@ -28,9 +28,6 @@ setup(
author_email='release-mgmt@mozilla.com',
url='https://github.com/mozilla/libmozdata',
install_requires=load_requirements('requirements.txt'),
extras_require={
'spikes': load_requirements('requirements-spikes.txt'),
},
packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']),
include_package_data=True,
zip_safe=False,

View file

@@ -1,56 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import os
import responses
from libmozdata.CrashInfo import CrashInfo
from libmozdata.socorro import SuperSearch
from tests.auto_mock import MockTestCase
class CrashInfoTest(MockTestCase):
mock_urls = [
SuperSearch.URL,
]
@responses.activate
def test_single(self):
path = 'toolkit/components/terminator/nsterminator.cpp'
ci = CrashInfo(path).get()
self.assertEqual(ci[path], 147022)
@responses.activate
def test_multiple(self):
path1 = 'toolkit/components/terminator/nsterminator.cpp'
path2 = 'gfx/layers/d3d11/textured3d11.cpp'
ci = CrashInfo([path1, path2]).get()
self.assertEqual(ci[path1], 147022)
self.assertEqual(ci[path2], 10322)
@responses.activate
def test_not_lower(self):
path = 'toolkit/components/terminator/nsTerminator.cpp'
ci = CrashInfo(path).get()
ci2 = CrashInfo(path.lower()).get()
self.assertEqual(ci[path], ci2[path.lower()])
@responses.activate
def test_basename(self):
path = 'toolkit/components/terminator/nsTerminator.cpp'
ci = CrashInfo(path).get()
ci2 = CrashInfo(os.path.basename(path)).get()
self.assertEqual(ci[path], ci2[os.path.basename(path)])
@responses.activate
def test_empty_array(self):
self.assertEqual(CrashInfo([]).get(), {})

View file

@@ -1,44 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import unittest
from libmozdata.bugzilla import Bugzilla
from libmozdata.socorro import Socorro
from libmozdata.hgmozilla import Mercurial
from libmozdata import dataanalysis
from libmozdata import utils
from tests.auto_mock import MockTestCase
import responses
class DataAnalysisTest(MockTestCase):
mock_urls = [
Bugzilla.URL,
Socorro.CRASH_STATS_URL,
Mercurial.HG_URL,
]
@responses.activate
def test_bug_analysis(self):
base_versions = {'nightly': 54, 'aurora': 53, 'beta': 52, 'release': 51, 'esr': 45}
info = dataanalysis.analyze_bugs(['1270686'],
base_versions=base_versions,
minimal_releases={'nightly': -1, 'aurora': -1, 'beta': 1, 'release': 1},
minimal_days={'nightly': 3, 'aurora': 3, 'beta': -1, 'release': 7})
self.assertEqual(list(info.keys()), ['1270686'])
info = info['1270686']
self.assertEqual(info['affected'], set())
self.assertEqual(info['approval'], {'aurora', 'beta'})
self.assertEqual(info['land']['aurora'], utils.get_date_ymd('2016-05-31 14:20:34'))
self.assertEqual(info['land']['beta'], utils.get_date_ymd('2016-05-26 21:02:09'))
self.assertEqual(info['land']['nightly'], utils.get_date_ymd('2016-05-31 10:00:19'))
self.assertEqual(info['signatures'], ['TppTimerpExecuteCallback'])
stops = info['stops']['TppTimerpExecuteCallback']
self.assertEqual(stops['aurora'], 'yes')
self.assertEqual(stops['beta'], 'yes')
self.assertEqual(stops['nightly'], 'yes')
if __name__ == '__main__':
unittest.main()

View file

@@ -1,38 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import unittest
from libmozdata import memory
class MemoryTest(unittest.TestCase):
def test_isweird(self):
self.assertTrue(memory.isweird('0x0', '64'))
self.assertTrue(memory.isweird('0x0', '32'))
self.assertTrue(memory.isweird('0xAAA', '64'))
self.assertTrue(memory.isweird('0xAAA', '32'))
self.assertTrue(memory.isweird('0xaaa', '64'))
self.assertTrue(memory.isweird('0xaaa', '32'))
self.assertTrue(memory.isweird('0xffff', '64'))
self.assertTrue(memory.isweird('0xffff', '32'))
self.assertFalse(memory.isweird('0xdeadbeef', '64'))
self.assertFalse(memory.isweird('0xdeadbeef', '32'))
self.assertFalse(memory.isweird('0xAAAAAAAAAAAB0000', '64'))
self.assertFalse(memory.isweird('0xfffffffffffB0000', '64'))
self.assertTrue(memory.isweird('0xffffffffffff0000', '64'))
self.assertTrue(memory.isweird('0xffff0000', '32'))
self.assertTrue(memory.isweird('0xffffffffffffaaaa', '64'))
self.assertTrue(memory.isweird('0xffffaaaa', '32'))
self.assertTrue(memory.isweird('0x000000000000ffff', '64'))
self.assertTrue(memory.isweird('0x0000ffff', '32'))
with self.assertRaises(Exception):
self.assertTrue(memory.isweird(None, '64'))
with self.assertRaises(Exception):
self.assertTrue(memory.isweird(42, '32'))
if __name__ == '__main__':
unittest.main()

View file

@@ -1,158 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
import unittest
from libmozdata import spikeanalysis
from libmozdata import utils
from tests.auto_mock import MockTestCase
import responses
import requests
class SpikeAnalysisTest(MockTestCase):
mock_urls = ['https://crash-analysis.mozilla.com/rkaiser/']
def test_get_spikes_1(self):
x = np.zeros(1000, dtype=np.float64)
spx = [30, 100, 250, 750]
spy = [5678, 7123, 4123, 6183]
x[spx] += spy
spikes, _ = spikeanalysis.get_spikes(x, alpha=0.01, method='mean', plot=False)
self.assertEqual(spikes, spx)
x += 500. * np.sin(np.arange(1000, dtype=np.float64))
spikes, _ = spikeanalysis.get_spikes(x, alpha=0.01, method='mean', plot=False)
self.assertEqual(spikes, spx)
spikes, _ = spikeanalysis.get_spikes(x, alpha=0.01, method='median', plot=False)
self.assertEqual(spikes, spx)
@responses.activate
def test_get_spikes_2(self):
url = 'https://crash-analysis.mozilla.com/rkaiser/Firefox-beta-crashes-categories.json'
response = requests.get(url)
data = response.json()
x = {}
max_date = utils.get_date_ymd('2016-09-09')
for k, v in data.items():
if 'startup' in v:
s = v['startup']
date = utils.get_date_ymd(k)
if date <= max_date:
x[date] = s.get('browser', 0)
up, down = spikeanalysis.get_spikes(x, alpha=0.005, win=120, method='median', plot=False)
expected_up = [45, 46, 47, 48, 166, 169, 173, 175, 220, 221, 222, 301, 302, 346, 347, 348, 349, 355, 359, 362, 363, 366, 369, 371, 383, 384, 386, 387, 390, 391, 397, 421, 422, 423, 425, 426, 432, 434, 460, 474, 514, 533, 535, 659, 661, 690, 693, 933, 934, 935, 937, 941, 944, 945, 948]
expected_down = [53, 332, 333, 335, 336, 337, 340, 373, 374, 375, 377, 379, 546, 547, 548, 554, 555, 560, 561, 562, 567, 568, 569, 575, 576, 726, 728]
self.assertEqual(up, expected_up)
self.assertEqual(down, expected_down)
@responses.activate
def test_is_spiking_1(self):
url = 'https://crash-analysis.mozilla.com/rkaiser/Firefox-beta-crashes-categories.json'
response = requests.get(url)
data = response.json()
x1 = {}
x2 = {}
max_date1 = utils.get_date_ymd('2016-09-09')
max_date2 = utils.get_date_ymd('2016-08-05')
for k, v in data.items():
if 'startup' in v:
s = v['startup']
date = utils.get_date_ymd(k)
if date <= max_date1:
x1[date] = s.get('browser', 0)
if date <= max_date2:
x2[date] = s.get('browser', 0)
isp = spikeanalysis.is_spiking(x1, alpha=0.005, win=120, method='median', plot=False)
self.assertFalse(isp)
isp = spikeanalysis.is_spiking(x2, alpha=0.005, win=120, method='median', plot=False)
self.assertTrue(isp)
@responses.activate
def test_is_spiking_2(self):
# values for signature nsFileStreamBase::Write from 2016-09-05 to 2016-09-13
x = [2, 6, 6, 9, 8, 3, 2, 160, 81742]
isp = spikeanalysis.is_spiking(x, alpha=0.01, win=-1, method='mean', plot=False)
self.assertTrue(isp)
@responses.activate
def test_is_spiking_ma(self):
url = 'https://crash-analysis.mozilla.com/rkaiser/Firefox-beta-crashes-categories.json'
response = requests.get(url)
data = response.json()
x1 = {}
x2 = {}
max_date1 = utils.get_date_ymd('2016-09-09')
max_date2 = utils.get_date_ymd('2016-08-05')
for k, v in data.items():
if 'startup' in v:
s = v['startup']
date = utils.get_date_ymd(k)
if date <= max_date1:
x1[date] = s.get('browser', 0)
if date <= max_date2:
x2[date] = s.get('browser', 0)
y1 = [p[1] for p in sorted(x1.items(), key=lambda p: p[0])]
y2 = [p[1] for p in sorted(x2.items(), key=lambda p: p[0])]
isp = spikeanalysis.is_spiking_ma(y1, alpha=2.5, win=7, method='mean', plot=False)
self.assertEqual(isp, 'none')
isp = spikeanalysis.is_spiking_ma(y2, alpha=2.5, win=7, method='mean', plot=False)
self.assertEqual(isp, 'up')
@responses.activate
def test_get_spikes_ma(self):
# this test is quite long
url = 'https://crash-analysis.mozilla.com/rkaiser/Firefox-beta-crashes-categories.json'
response = requests.get(url)
data = response.json()
x1 = {}
max_date1 = utils.get_date_ymd('2016-10-02')
for k, v in data.items():
if 'startup' in v:
s = v['startup']
date = utils.get_date_ymd(k)
if date <= max_date1:
x1[date] = s.get('browser', 0) + s.get('content', 0) + s.get('plugin', 0)
y1 = [p[1] for p in sorted(x1.items(), key=lambda p: p[0])]
up, down = spikeanalysis.get_spikes_ma(y1, alpha=2.5, win=7, method='mean', plot=False)
expected_up = [46, 47, 48, 82, 83, 84, 124, 125, 126, 127, 163, 164, 166, 169, 210, 220, 221, 222, 252, 253, 301, 302, 342, 343, 345, 346, 347, 348, 349, 383, 384, 386, 387, 422, 423, 425, 426, 460, 474, 512, 514, 533, 535, 659, 661, 689, 690, 734, 738, 757, 759, 760, 761, 852, 933, 934, 935, 944, 945]
expected_down = [53, 60, 109, 115, 137, 144, 178, 179, 199, 200, 228, 234, 235, 308, 310, 311, 312, 332, 333, 335, 336, 337, 357, 358, 365, 373, 374, 375, 400, 401, 402, 403, 430, 431, 438, 541, 544, 545, 546, 547, 548, 667, 714, 715, 716, 726, 784, 785, 786, 916, 952, 953, 954, 960, 961]
self.assertEqual(up, expected_up)
self.assertEqual(down, expected_down)
def test_generalized_esd(self):
x = [10, 11, 9, 10, 8, 9, 12, 11, 13]
outliers = spikeanalysis.generalized_esd(x, 5)
self.assertEqual(outliers, [])
x = [10, 11, 9, 10, 21, 9, 12, 11, 13]
outliers = spikeanalysis.generalized_esd(x, 5)
self.assertEqual(outliers, [4])
x = [10, 11, 9, 10, 21, 9, 12, 11, 28]
outliers = spikeanalysis.generalized_esd(x, 5)
self.assertEqual(outliers, [8, 4])
x = []
outliers = spikeanalysis.generalized_esd(x, 5)
self.assertEqual(outliers, [])
if __name__ == '__main__':
unittest.main()