created custom manager and queryset for stats to support common summaries

This also includes a StatsDictField that converts serialized PHP into a summable dictionary.
This commit is contained in:
Scott McCammon 2010-02-05 20:35:26 -08:00
Родитель 2d8aecb3e4
Коммит 0f0530630e
5 изменённых файлов: 653 добавлений и 9 удалений

342
apps/stats/db.py Normal file
Просмотреть файл

@ -0,0 +1,342 @@
from datetime import date, timedelta
import phpserialize as php
from django.db import models
import caching.base
# Common date helpers
# These all take a date or datetime and return a date.
def prev_month(d):
    """Return the first day of the month before the one containing ``d``.

    ``d`` may be a date or a datetime; the result is always a date.
    """
    # Work with a zero-based running count of months, step back by one,
    # then split the count back into a year and a zero-based month index.
    month_count = d.year * 12 + (d.month - 1) - 1
    year = month_count // 12
    month_index = month_count % 12
    return date(year, month_index + 1, 1)
def prev_week(d):
    """Return the Monday of the week before the one containing ``d``.

    ``d`` may be a date or a datetime; the result is always a date.
    """
    # weekday() is 0 on Monday, so subtracting it reaches this week's
    # Monday; the extra week moves to the Monday before that.
    monday = d - timedelta(weeks=1, days=d.weekday())
    return date(year=monday.year, month=monday.month, day=monday.day)
def prev_day(d):
    """Return the calendar day before ``d`` as a date.

    ``d`` may be a date or a datetime; any time component is dropped.
    """
    yesterday = d - timedelta(days=1)
    return date(year=yesterday.year, month=yesterday.month,
                day=yesterday.day)
def start_of_month(d):
    """Return the first day of the month containing ``d`` as a date.

    ``d`` may be a date or a datetime.
    """
    year, month = d.year, d.month
    return date(year=year, month=month, day=1)
def start_of_week(d):
    """Return the Monday of the week containing ``d`` as a date.

    ``d`` may be a date or a datetime.
    """
    # weekday() counts days elapsed since Monday (Monday itself is 0).
    days_since_monday = d.weekday()
    monday = d - timedelta(days=days_since_monday)
    return date(year=monday.year, month=monday.month, day=monday.day)
def start_of_day(d):
    """Return the calendar day of ``d`` as a plain date.

    ``d`` may be a date or a datetime; any time component is dropped.
    """
    year, month, day = d.year, d.month, d.day
    return date(year=year, month=month, day=day)
class StatsQuerySet(caching.base.CachingQuerySet):
    """Queryset that can sum its rows overall or per date period.

    The model must expose a ``stats_date_field`` attribute naming the field
    used to assign each row to a period. The period summaries assume the
    queryset is ordered by that field in reverse chronological order.
    """

    def __init__(self, *args, **kwargs):
        super(StatsQuerySet, self).__init__(*args, **kwargs)
        # Read the model from the instance instead of kwargs['model'] so a
        # positionally passed model does not raise KeyError.
        # NOTE(review): assumes the parent __init__ sets self.model, which
        # zero_summary() below already relies on.
        self._stats_date_field = self.model.stats_date_field

    def summary(self, *fields, **kwargs):
        """Summarizes the entire queryset.

        Arguments should be the names of summable fields found in the
        queryset. Fields may be renamed in the results by using named
        arguments:

        >>> qs.summary('swallows', dead_parrots='parrots')
        {'row_count': 1, 'start': None, 'swallows': 10, 'dead_parrots': 7}

        The special 'start' value is always None here because the summary
        is not tied to a period.
        """
        fields = self._map_fields(*fields, **kwargs)
        summary = self.zero_summary(None, **fields)
        for obj in self:
            summary = self._accumulate(obj, summary, **fields)
        return summary

    def daily_summary(self, *fields, **kwargs):
        """Generate daily/weekly/monthly summaries on the queryset.

        The queryset must be in reverse chronological order.

        Recognized keyword arguments:
            fill_holes
                If True, create zero count summaries for periods in the
                middle of the queryset that contain no records

        All other arguments should be the names of summable fields found
        in the queryset. Fields may be renamed in the results by using
        named arguments, for example:

        >>> [s for s in q.daily_summary('swallows', dead_parrots='parrots')]
        [{'row_count': 1, 'start': date(2009, 5, 3), 'swallows': 0,
          'dead_parrots': 2},
         {'row_count': 1, 'start': date(2009, 5, 2), 'swallows': 10,
          'dead_parrots': 5}]
        """
        fill_holes = kwargs.pop('fill_holes', False)
        fields = self._map_fields(*fields, **kwargs)
        return self._summary_iter(fields, fill_holes=fill_holes)

    def weekly_summary(self, *fields, **kwargs):
        fill_holes = kwargs.pop('fill_holes', False)
        fields = self._map_fields(*fields, **kwargs)
        return self._summary_iter(fields, fill_holes=fill_holes,
                                  previous_date=prev_week,
                                  format_date=start_of_week)
    # The three period summaries share one contract, so share the docstring.
    weekly_summary.__doc__ = daily_summary.__doc__

    def monthly_summary(self, *fields, **kwargs):
        fill_holes = kwargs.pop('fill_holes', False)
        fields = self._map_fields(*fields, **kwargs)
        return self._summary_iter(fields, fill_holes=fill_holes,
                                  previous_date=prev_month,
                                  format_date=start_of_month)
    monthly_summary.__doc__ = daily_summary.__doc__

    def zero_summary(self, start_date, **fields):
        """Returns a dictionary of zero values for the mapped fields.

        ``fields`` maps result keys to field names (see _map_fields). The
        special name 'start' yields ``start_date``, 'row_count' yields 0,
        a StatsDictField yields an empty StatsDict and any other model
        field yields 0. Only keys present in ``fields`` appear in the
        result.

        >>> qs.zero_summary(date(2009, 1, 1), start='start',
        ...                 row_count='row_count', count_total='count')
        {'start': date(2009, 1, 1), 'row_count': 0, 'count_total': 0}
        """
        res = {}
        for res_key, fname in fields.items():
            # handle special summary fields
            if fname == 'start':
                res[res_key] = start_date
                continue
            elif fname == 'row_count':
                res[res_key] = 0
                continue

            # handle regular model fields
            field = self.model._meta.get_field_by_name(fname)[0]
            if isinstance(field, StatsDictField):
                # an empty summable dictionary
                res[res_key] = StatsDict()
            else:
                # everything else starts at 0 whether it can be summed or not
                res[res_key] = 0
        return res

    def _accumulate(self, obj, ac, **fields):
        """Accumulates (sums) field values of an object into ``ac``.

        ``ac`` is updated in place and also returned. ``fields`` maps keys
        of ``ac`` to attribute names on ``obj``; the special name
        'row_count' increments its key by one and 'start' is left alone.

        Example:

        >>> ac = {'start': date(2009, 1, 1), 'row_count': 2, 'sum': 10}
        >>> someobj.count = 4
        >>> qs._accumulate(someobj, ac, start='start',
        ...                row_count='row_count', sum='count')
        {'start': date(2009, 1, 1), 'row_count': 3, 'sum': 14}
        """
        for ac_key, field in fields.items():
            # handle special summary fields
            if field == 'row_count':
                ac[ac_key] += 1
                continue
            elif field == 'start':
                continue

            # handle regular model fields
            try:
                # works with numbers and StatsDict
                ac[ac_key] = ac[ac_key] + getattr(obj, field)
            except TypeError:
                # anything that cannot be added (a None value, for
                # instance) leaves the running total untouched
                pass
        return ac

    def _map_fields(self, *fields, **kwargs):
        """Make a map of result key names to model field names.

        Named arguments take precedence. The special fields 'start' and
        'row_count' are added under their own names unless the caller has
        already mapped them to another key, for example:

        >>> qs._map_fields('a', 'b', 'c', c='cool', d='duh')
        {'a': 'a', 'b': 'b', 'c': 'cool', 'd': 'duh',
         'start': 'start', 'row_count': 'row_count'}

        Raises KeyError when a reserved key ('start' or 'row_count') is
        used for another field without the special field being remapped.
        """
        fields = dict(zip(fields, fields))
        fields.update(kwargs)

        # the special fields 'start' and 'row_count' are implicit but
        # may be remapped
        if 'start' not in fields.values():
            if 'start' in fields:
                raise KeyError("reserved field 'start' must be remapped")
            fields['start'] = 'start'

        if 'row_count' not in fields.values():
            if 'row_count' in fields:
                raise KeyError("reserved field 'row_count' must be remapped")
            fields['row_count'] = 'row_count'

        return fields

    def _summary_iter(self, fields, fill_holes=False, previous_date=prev_day,
                      format_date=start_of_day):
        """Generates generic date period summaries of fields in the queryset.

        Rows are read in queryset order and grouped while consecutive rows
        fall in the same period, so the queryset must already be ordered by
        date, newest first.

        Arguments:
            fields
                A dictionary that maps keys to fieldnames (see _map_fields);
                it must map some key to the special 'start' field
            fill_holes
                If True, create zero count summaries for periods in the
                middle of the queryset that contain no records.
            previous_date
                A function that, given the start of a period, returns the
                start of the period immediately before it
            format_date
                A function that calculates the start of the period for
                a date or datetime
        """
        # we support remapping the special 'start' field - find it!
        start_key = [k for (k, v) in fields.items() if v == 'start'][0]

        # Template copied for every period. A shallow copy is enough
        # because _accumulate rebinds values instead of mutating them.
        ac_zero = self.zero_summary(None, **fields)
        ac = None

        for obj in self:
            d = format_date(getattr(obj, self._stats_date_field))

            if ac is None:
                # XXX: add option to fill in holes at end of timeseries?
                # prep first non-zero result
                ac = ac_zero.copy()
                ac[start_key] = d

            if ac[start_key] != d:
                # the row belongs to an earlier period: emit the finished one
                yield ac

                # option: fill holes in middle of timeseries
                if fill_holes:
                    nd = previous_date(ac[start_key])
                    while nd > d:
                        ac_fill = ac_zero.copy()
                        ac_fill[start_key] = nd
                        yield ac_fill
                        nd = previous_date(nd)

                # prep next non-zero result
                ac = ac_zero.copy()
                ac[start_key] = d

            # accumulate
            ac = self._accumulate(obj, ac, **fields)

        # emit the last (oldest) period, if the queryset had any rows
        if ac is not None:
            yield ac

        # XXX: add option to fill in holes at start of timeseries?
class StatsManager(caching.base.CachingManager):
    """Manager whose default queryset is a date-ordered StatsQuerySet.

    ``date_field`` names the model field that holds each row's date.
    """

    def __init__(self, date_field='date'):
        super(StatsManager, self).__init__()
        self.date_field = date_field

    def contribute_to_class(self, cls, name):
        super(StatsManager, self).contribute_to_class(cls, name)
        # Publish the date field name on the model; StatsQuerySet reads it
        # back from there.
        cls.add_to_class('stats_date_field', self.date_field)

    def get_query_set(self):
        # '0000-00-00' dates are sadly valid in the stats tables but mean
        # nothing for analysis. The value is not null and has no python
        # equivalent, so rows are filtered with an inexact comparison.
        lookup = self.date_field + '__gt'
        date_filter = {lookup: date(1990, 1, 1)}

        # The summary methods of StatsQuerySet require `date desc`
        # ordering, so it is applied by default as a convenience.
        newest_first = '-' + self.date_field

        queryset = StatsQuerySet(model=self.model)
        queryset = queryset.filter(**date_filter)
        return queryset.order_by(newest_first)
class StatsDict(dict):
    """A dict whose ``+`` operator merges dictionaries by summing values."""

    def __add__(self, d):
        """Combines two dictionaries, summing values where keys overlap.

        Neither operand is modified; the result is a new StatsDict.

        Example:

        >>> a = StatsDict({'a': 1, 'b': 2})
        >>> b = StatsDict({'a': 1, 'c': 4})
        >>> a + b
        {'a': 2, 'b': 2, 'c': 4}
        """
        merged = self._rdict_sum(self, d)
        return StatsDict(merged)

    @classmethod
    def _rdict_sum(cls, a, b):
        """Recursively sum two dictionaries into a new plain dict."""
        keys = set(a)
        keys.update(b)

        totals = {}
        for key in keys:
            left = a.get(key, 0)
            right = b.get(key, 0)
            left_is_dict = isinstance(left, dict)
            right_is_dict = isinstance(right, dict)

            if not (left_is_dict or right_is_dict):
                totals[key] = left + right
                continue

            # A dictionary on one side only: the other side counts as an
            # empty dictionary, whatever value it actually held.
            if not left_is_dict:
                left = {}
            if not right_is_dict:
                right = {}
            totals[key] = cls._rdict_sum(left, right)
        return totals
class StatsDictField(models.TextField):
    """Text column holding a dictionary of counts as serialized PHP.

    Values are exposed in Python as StatsDict instances (or None) and are
    written back to the database as a serialized PHP string.
    """

    description = 'A dictionary of counts stored as serialized php.'

    __metaclass__ = models.SubfieldBase

    def db_type(self):
        return 'text'

    def to_python(self, value):
        """Convert a stored or assigned value into a StatsDict.

        Returns None for None, for strings that fail to unserialize and
        for serialized values that are not dictionaries.
        """
        # object case
        if value is None:
            return None
        if isinstance(value, dict):
            return StatsDict(value)

        # string case
        try:
            d = php.unserialize(value)
        except ValueError:
            d = None
        if isinstance(d, dict):
            return StatsDict(d)
        return None

    def get_db_prep_value(self, value):
        """Serialize ``value`` for storage; returns None when not possible."""
        try:
            value = php.serialize(dict(value))
        except TypeError:
            # dict() rejected the value (None, for example)
            value = None
        return value

    def value_to_string(self, obj):
        """Return this field's value on ``obj`` in its serialized form.

        ``obj`` is the model instance being serialized, not the field
        value, so the value is looked up on it first. The original
        ``str(obj)`` produced the instance's string form instead.
        """
        value = self._get_val_from_obj(obj)
        return self.get_db_prep_value(value)

Просмотреть файл

@ -0,0 +1,134 @@
[
{
"pk": 1,
"model": "addons.addontype",
"fields": {
"name": null,
"name_plural": null,
"description": null,
"modified": "2006-07-11 11:11:11",
"created": "2006-06-11 18:23:31"
}
},
{
"pk": 4,
"model": "addons.addon",
"fields": {
"type": 1,
"status": 4,
"highest_status": 4,
"description": null,
"modified": "2008-05-22 11:59:13",
"name": null,
"created": "2004-06-11 18:23:31"
}
},
{
"pk": 1,
"model": "stats.downloadcount",
"fields": {
"addon": 4,
"count": 10,
"sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
"date": "2009-06-01"
}
},
{
"pk": 2,
"model": "stats.downloadcount",
"fields": {
"addon": 4,
"count": 10,
"sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
"date": "2009-06-07"
}
},
{
"pk": 3,
"model": "stats.downloadcount",
"fields": {
"addon": 4,
"count": 10,
"sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
"date": "2009-06-12"
}
},
{
"pk": 4,
"model": "stats.downloadcount",
"fields": {
"addon": 4,
"count": 10,
"sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
"date": "2009-06-20"
}
},
{
"pk": 5,
"model": "stats.downloadcount",
"fields": {
"addon": 4,
"count": 10,
"sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
"date": "2009-06-28"
}
},
{
"pk": 6,
"model": "stats.downloadcount",
"fields": {
"addon": 4,
"count": 10,
"sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
"date": "2009-07-03"
}
},
{
"pk": 7,
"model": "stats.downloadcount",
"fields": {
"addon": 4,
"count": 10,
"sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
"date": "2009-08-03"
}
},
{
"pk": 8,
"model": "stats.downloadcount",
"fields": {
"addon": 4,
"count": 10,
"sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
"date": "2009-09-03"
}
},
{
"pk": 1,
"model": "stats.updatecount",
"fields": {
"addon": 4,
"count": 10,
"versions": "a:2:{s:3:\"1.0\";i:2;s:3:\"2.0\";i:8;}",
"statuses": "a:2:{s:11:\"userEnabled\";i:9;s:12:\"userDisabled\";i:1;}",
"applications": "a:1:{s:38:\"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}\";a:1:{s:5:\"3.0.9\";i:7;}}",
"oses": "a:2:{s:5:\"Linux\";i:3;s:5:\"WINNT\";i:7;}",
"locales": null,
"date": "2009-06-01"
}
},
{
"pk": 2,
"model": "stats.updatecount",
"fields": {
"addon": 4,
"count": 10,
"versions": "a:2:{s:3:\"1.0\";i:2;s:3:\"2.0\";i:8;}",
"statuses": "a:2:{s:11:\"userEnabled\";i:9;s:12:\"userDisabled\";i:1;}",
"applications": "a:1:{s:38:\"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}\";a:1:{s:5:\"3.0.9\";i:6;}}",
"oses": "a:2:{s:5:\"Linux\";i:3;s:5:\"WINNT\";i:7;}",
"locales": null,
"date": "2009-06-02"
}
}
]

Просмотреть файл

@ -2,14 +2,18 @@ from django.db import models
import caching.base
from .db import StatsDict, StatsDictField, StatsManager
class DownloadCount(caching.base.CachingMixin, models.Model):
addon = models.ForeignKey('addons.Addon')
count = models.IntegerField()
count = models.PositiveIntegerField()
date = models.DateField()
src = models.TextField()
objects = caching.base.CachingManager()
# Leave this out of queries if you can.
sources = StatsDictField(db_column='src', null=True)
objects = StatsManager('date')
class Meta:
db_table = 'download_counts'
@ -17,15 +21,17 @@ class DownloadCount(caching.base.CachingMixin, models.Model):
class UpdateCount(caching.base.CachingMixin, models.Model):
addon = models.ForeignKey('addons.Addon')
count = models.IntegerField()
count = models.PositiveIntegerField()
date = models.DateField()
# Leave these out of queries if you can.
application = models.TextField()
locale = models.TextField()
os = models.TextField()
status = models.TextField()
version = models.TextField()
versions = StatsDictField(db_column='version', null=True)
statuses = StatsDictField(db_column='status', null=True)
applications = StatsDictField(db_column='application', null=True)
oses = StatsDictField(db_column='os', null=True)
locales = StatsDictField(db_column='locale', null=True)
objects = StatsManager('date')
class Meta:
db_table = 'update_counts'

Просмотреть файл

@ -0,0 +1,45 @@
from datetime import date, datetime
from django import test
from nose.tools import eq_
from stats.db import StatsDict
from stats.db import prev_month
class TestStatsDict(test.TestCase):
    """Checks that adding StatsDict instances sums matching keys."""

    dict_a = StatsDict({'a': 3, 'nested': {'b': 5, 'c': 6}})
    dict_b = StatsDict({'a': 3, 'b': 1, 'nested': {'b': 5, 'c': 6}})
    dict_empty = StatsDict({})

    def test_add(self):
        # empty + empty stays empty
        eq_(self.dict_empty + self.dict_empty, self.dict_empty)

        # adding an empty dictionary changes nothing
        eq_(self.dict_a + self.dict_empty, self.dict_a)

        # overlapping keys are summed, nested dictionaries included
        expected = StatsDict({'a': 6, 'b': 1, 'nested': {'b': 10, 'c': 12}})
        eq_(self.dict_a + self.dict_b, expected)
class TestDateUtils(test.TestCase):
    """Checks the date helper functions."""

    def test_prev_month(self):
        # (input, expected start of the previous month), covering the
        # year boundary, a leap-day February and a datetime input
        cases = (
            (date(2008, 1, 1), date(2007, 12, 1)),
            (date(2008, 1, 31), date(2007, 12, 1)),
            (date(2008, 2, 1), date(2008, 1, 1)),
            (date(2008, 2, 29), date(2008, 1, 1)),
            (datetime(2008, 2, 29, 23, 59, 59), date(2008, 1, 1)),
            (date(2008, 3, 1), date(2008, 2, 1)),
            (date(2008, 3, 31), date(2008, 2, 1)),
            (date(2008, 4, 1), date(2008, 3, 1)),
            (date(2008, 4, 30), date(2008, 3, 1)),
            (date(2008, 12, 1), date(2008, 11, 1)),
            (date(2008, 12, 31), date(2008, 11, 1)),
        )
        for given, expected in cases:
            eq_(prev_month(given), expected, 'unexpected prev_month result')

Просмотреть файл

@ -0,0 +1,117 @@
from django import test
from nose.tools import eq_
from stats.models import DownloadCount, UpdateCount, StatsDict
from datetime import date
class TestDownloadCountModel(test.TestCase):
    """Exercises DownloadCount summaries against the stats fixture."""

    fixtures = ['stats/test_models.json']

    def test_sources(self):
        sources = DownloadCount.objects.get(id=1).sources
        assert isinstance(sources, StatsDict), 'sources is not a StatsDict'
        assert len(sources) > 0, 'sources is empty'

    def test_summary(self):
        # somewhat contrived, but a good test: summarize the entire dataset
        everything = DownloadCount.objects.all()
        summary = everything.summary(count_sum='count',
                                     sources_sum='sources')

        # two requested keys plus the implicit 'start' and 'row_count'
        eq_(len(summary), 4, 'unexpected number of keys in summary')
        eq_(summary['start'], None, 'start is not None')
        assert summary['row_count'] > 0, 'zero rows in summary'
        assert summary['count_sum'] > 0, 'zero count_sum in summary'
        assert sum(summary['sources_sum'].values()) > 0, \
            'zero sources in summary'

    def test_remap_special_fields(self):
        # the special fields get new names and 'start' is reused for 'count'
        remapped = {'date': 'start', 'rows': 'row_count', 'start': 'count'}
        qs = DownloadCount.objects.filter(pk=1)
        days = list(qs.daily_summary(**remapped))

        eq_(len(days), 1, 'unexpected number of days')
        day = days[0]
        assert 'date' in day, 'date key not in summary results'
        assert 'rows' in day, 'rows key not in summary results'
        assert 'start' in day, 'start key not in summary results'

        eq_(day['date'], date(2009, 6, 1), 'unexpected date value')
        eq_(day['rows'], 1, 'unexpected rows value')
        eq_(day['start'], 10, 'unexpected start value')

    def test_weekly_summary(self):
        period = (date(2009, 6, 1), date(2009, 7, 3))
        qs = DownloadCount.objects.filter(addon=4, date__range=period)
        weeks = list(qs.weekly_summary('count', 'sources'))

        eq_(len(weeks), 5, 'unexpected number of weeks')
        # results are newest first, so index 4 is the oldest week
        eq_(weeks[0]['start'], date(2009, 6, 29),
            'unexpected start date for week 1')
        oldest = weeks[4]
        eq_(oldest['start'], date(2009, 6, 1),
            'unexpected start date for week 5')
        eq_(oldest['row_count'], 2, 'unexpected # of rows in week 5')
        eq_(oldest['count'], 20, 'unexpected count total in week 5')
        eq_(sum(oldest['sources'].values()), 10,
            'unexpected sources total in week 5')

    def test_monthly_summary(self):
        period = (date(2009, 6, 1), date(2009, 9, 30))
        qs = DownloadCount.objects.filter(addon=4, date__range=period)
        months = list(qs.monthly_summary('count', 'sources'))

        eq_(len(months), 4, 'unexpected number of months')
        # results are newest first, so index 3 is the oldest month
        eq_(months[0]['start'], date(2009, 9, 1),
            'unexpected start date for month 1')
        oldest = months[3]
        eq_(oldest['start'], date(2009, 6, 1),
            'unexpected start date for month 4')
        eq_(oldest['row_count'], 5, 'unexpected # of rows in month 4')
        eq_(oldest['count'], 50, 'unexpected count total in month 4')
        eq_(sum(oldest['sources'].values()), 25,
            'unexpected sources total in month 4')

    def test_daily_fill_holes(self):
        period = (date(2009, 6, 1), date(2009, 6, 7))
        qs = DownloadCount.objects.filter(addon=4, date__range=period)
        days = list(qs.daily_summary('count', 'sources', fill_holes=True))

        eq_(len(days), 7, 'unexpected number of days')
        # the fixture has no row for June 6th, so day 2 is a filled hole
        hole = days[1]
        eq_(hole['start'], date(2009, 6, 6),
            'unexpected start date for day 2')
        eq_(hole['row_count'], 0, 'unexpected non-zero row_count')
        eq_(hole['count'], 0, 'unexpected non-zero count')
        eq_(hole['sources'], {}, 'unexpected non-empty sources')
class TestUpdateCountModel(test.TestCase):
    """Exercises the serialized UpdateCount fields against the fixture."""

    fixtures = ['stats/test_models.json']
    # application id and version used as keys in the fixture's
    # 'applications' data
    test_app = '{ec8030f7-c20a-464f-9b0e-13a3a9e97384}'
    test_ver = '3.0.9'

    def test_serial_types(self):
        uc = UpdateCount.objects.get(id=1)

        assert isinstance(uc.versions, StatsDict), 'versions not a StatsDict'
        assert isinstance(uc.statuses, StatsDict), 'statuses not a StatsDict'
        assert isinstance(uc.applications, StatsDict), \
            'applications not a StatsDict'
        assert isinstance(uc.oses, StatsDict), 'oses not a StatsDict'
        # identity check: an empty StatsDict must not pass for None
        assert uc.locales is None, 'locales is not None'
        assert len(uc.statuses) > 0, 'statuses is empty'

    def test_applications(self):
        uc = UpdateCount.objects.get(id=1)

        assert isinstance(uc.applications[self.test_app], dict), \
            'applications item is not a dict'
        assert uc.applications[self.test_app][self.test_ver] == 7, \
            'unexpected count for app version'

    def test_applications_summary(self):
        qs = UpdateCount.objects.filter(addon=4,
                date__range=(date(2009, 6, 1), date(2009, 6, 2)))
        summary = qs.summary(apps='applications')

        eq_(summary['row_count'], 2,
            'unexpected row_count in applications summary')
        # the two fixture rows count 7 and 6 for this app version
        eq_(summary['apps'][self.test_app][self.test_ver], 13,
            'unexpected total for app version')