From 0f0530630e184439db47a438bd49913bd0470185 Mon Sep 17 00:00:00 2001
From: Scott McCammon
Date: Fri, 5 Feb 2010 20:35:26 -0800
Subject: [PATCH] created custom manager and queryset for stats to support common summaries

This also includes a StatsDictField that converts serialized php into a
summable dictionary
---
 apps/stats/db.py                           |  355 +++++++++++++++++++++
 apps/stats/fixtures/stats/test_models.json |  134 ++++++++
 apps/stats/models.py                       |   24 +-
 apps/stats/tests/test_db.py                |   45 ++
 apps/stats/tests/test_models.py            |  117 +++++++
 5 files changed, 666 insertions(+), 9 deletions(-)
 create mode 100644 apps/stats/db.py
 create mode 100644 apps/stats/fixtures/stats/test_models.json
 create mode 100644 apps/stats/tests/test_db.py
 create mode 100644 apps/stats/tests/test_models.py

diff --git a/apps/stats/db.py b/apps/stats/db.py
new file mode 100644
index 0000000000..e53b1de6da
--- /dev/null
+++ b/apps/stats/db.py
@@ -0,0 +1,355 @@
+from datetime import date, timedelta
+import phpserialize as php
+
+from django.db import models
+
+import caching.base
+
+
+# Common date helpers
+# These all take a date or datetime and return a date.
+
+
+def prev_month(d):
+    """Determine the start date of the previous month."""
+    yr, mo = divmod(d.year * 12 + d.month - 2, 12)
+    return date(yr, mo + 1, 1)
+
+
+def prev_week(d):
+    """Determine the start date of the previous week."""
+    ws = d - timedelta(d.weekday() + 7)
+    return date(ws.year, ws.month, ws.day)
+
+
+def prev_day(d):
+    """Determine the previous day."""
+    pd = d - timedelta(1)
+    return date(pd.year, pd.month, pd.day)
+
+
+def start_of_month(d):
+    """Determine the start of the month for a date or datetime."""
+    return date(d.year, d.month, 1)
+
+
+def start_of_week(d):
+    """Determine the start of the week for a date or datetime."""
+    ws = d - timedelta(d.weekday())
+    return date(ws.year, ws.month, ws.day)
+
+
+def start_of_day(d):
+    """Determine the start of the day for a date or datetime."""
+    return date(d.year, d.month, d.day)
+
+
+class StatsQuerySet(caching.base.CachingQuerySet):
+    """Caching queryset that builds period summaries of stats rows."""
+
+    def __init__(self, *args, **kwargs):
+        super(StatsQuerySet, self).__init__(*args, **kwargs)
+        # Read the model from the instance rather than kwargs['model'] so
+        # that passing the model positionally does not raise a KeyError.
+        self._stats_date_field = self.model.stats_date_field
+
+    def summary(self, *fields, **kwargs):
+        """Summarizes the entire queryset.
+
+        Arguments should be the names of summable fields found in the queryset.
+        Fields may be renamed in the results by using named arguments:
+
+        >>> qs.summary('swallows', dead_parrots='parrots')
+        {'row_count': 1, 'start': None, 'swallows': 10, 'dead_parrots': 7}
+        """
+
+        fields = self._map_fields(*fields, **kwargs)
+        summary = self.zero_summary(None, **fields)
+        for obj in self:
+            summary = self._accumulate(obj, summary, **fields)
+
+        return summary
+
+    def daily_summary(self, *fields, **kwargs):
+        """Generate daily/weekly/monthly summaries on the queryset.
+
+        The queryset must be in reverse chronological order.
+
+        Recognized keyword arguments:
+            fill_holes
+                If True, create zero count summaries for periods in the
+                middle of the queryset that contain no records
+
+        All other arguments should be the names of summable fields found
+        in the queryset. Fields may be renamed in the results by using
+        named arguments, for example:
+
+        >>> [s for s in q.daily_summary('swallows', dead_parrots='parrots')]
+        [{'row_count': 1, 'start': date(2009, 5, 3), 'swallows': 0,
+          'dead_parrots': 2},
+         {'row_count': 1, 'start': date(2009, 5, 2), 'swallows': 10,
+          'dead_parrots': 5}]
+        """
+        fill_holes = kwargs.pop('fill_holes', False)
+        fields = self._map_fields(*fields, **kwargs)
+        return self._summary_iter(fields, fill_holes=fill_holes)
+
+    def weekly_summary(self, *fields, **kwargs):
+        fill_holes = kwargs.pop('fill_holes', False)
+        fields = self._map_fields(*fields, **kwargs)
+        return self._summary_iter(fields, fill_holes=fill_holes,
+            previous_date=prev_week, format_date=start_of_week)
+
+    weekly_summary.__doc__ = daily_summary.__doc__
+
+    def monthly_summary(self, *fields, **kwargs):
+        fill_holes = kwargs.pop('fill_holes', False)
+        fields = self._map_fields(*fields, **kwargs)
+        return self._summary_iter(fields, fill_holes=fill_holes,
+            previous_date=prev_month, format_date=start_of_month)
+
+    monthly_summary.__doc__ = daily_summary.__doc__
+
+    def zero_summary(self, start_date, **fields):
+        """Returns a dictionary of 0 values for specified fields.
+
+        >>> qs.zero_summary(date(2009, 1, 1), start='start', total='count')
+        {'start': date(2009, 1, 1), 'total': 0}
+        """
+
+        res = {}
+        for res_key, fname in fields.items():
+            # handle special summary fields
+            if fname == 'start':
+                res[res_key] = start_date
+                continue
+            elif fname == 'row_count':
+                res[res_key] = 0
+                continue
+
+            # handle regular model fields
+            field = self.model._meta.get_field_by_name(fname)[0]
+            if isinstance(field, StatsDictField):
+                # an empty summable dictionary
+                res[res_key] = StatsDict()
+            else:
+                # everything else starts at 0 whether it can be summed or not
+                res[res_key] = 0
+        return res
+
+    def _accumulate(self, obj, ac, **fields):
+        """Accumulates (sums) field values of an object.
+
+        Example:
+
+        >>> ac = {'start': date(2009, 1, 1), 'row_count': 2, 'sum': 10}
+        >>> someobj.count = 4
+        >>> qs._accumulate(someobj, ac, sum='count')
+        {'start': date(2009, 1, 1), 'row_count': 3, 'sum': 14}
+        """
+        for ac_key, field in fields.items():
+            # handle special summary fields
+            if field == 'row_count':
+                ac[ac_key] += 1
+                continue
+            elif field == 'start':
+                continue
+
+            # handle regular model fields
+            try:
+                # works with numbers and StatsDict
+                ac[ac_key] = ac[ac_key] + getattr(obj, field)
+            except TypeError:
+                # anything else will keep the initial value (probably 0)
+                pass
+        return ac
+
+    def _map_fields(self, *fields, **kwargs):
+        """Make a map of result key names to model field names.
+
+        Named arguments take precedence, for example:
+
+        >>> qs._map_fields('a', 'b', 'c', c='cool', d='duh')
+        {'a': 'a', 'b': 'b', 'c': 'cool', 'd': 'duh',
+         'start': 'start', 'row_count': 'row_count'}
+        """
+        fields = dict(zip(fields, fields))
+        fields.update(kwargs)
+
+        # the special fields 'start' and 'row_count' are implicit but
+        # may be remapped
+        if 'start' not in fields.values():
+            if 'start' in fields:
+                raise KeyError("reserved field 'start' must be remapped")
+            fields['start'] = 'start'
+
+        if 'row_count' not in fields.values():
+            if 'row_count' in fields:
+                raise KeyError("reserved field 'row_count' must be remapped")
+            fields['row_count'] = 'row_count'
+
+        return fields
+
+    def _summary_iter(self, fields, fill_holes=False, previous_date=prev_day,
+                      format_date=start_of_day):
+        """Generates generic date period summaries of fields in the queryset.
+
+        The fields argument should be a dictionary that maps result keys
+        to valid field names in the queryset.
+
+        Arguments:
+            fields
+                A dictionary that maps keys to fieldnames (see _map_fields)
+
+            fill_holes
+                If True, create zero count summaries for periods in the
+                middle of the queryset that contain no records.
+
+            previous_date
+                A function that calculates the start of the next period
+                prior to a date
+
+            format_date
+                A function that calculates the start of the period for
+                a date or datetime
+        """
+        # we support remapping the special 'start' field - find it!
+        start_key = [k for (k, v) in fields.items() if v == 'start'][0]
+
+        ac_zero = self.zero_summary(None, **fields)
+        ac = None
+
+        for obj in self:
+            d = format_date(getattr(obj, self._stats_date_field))
+
+            if ac is None:
+                # XXX: add option to fill in holes at end of timeseries?
+                # prep first non-zero result
+                ac = ac_zero.copy()
+                ac[start_key] = d
+
+            if ac[start_key] != d:
+                yield ac
+
+                # option: fill holes in middle of timeseries
+                if fill_holes:
+                    nd = previous_date(ac[start_key])
+                    while nd > d:
+                        ac_fill = ac_zero.copy()
+                        ac_fill[start_key] = nd
+                        yield ac_fill
+                        nd = previous_date(nd)
+
+                # prep next non-zero result
+                ac = ac_zero.copy()
+                ac[start_key] = d
+
+            # accumulate
+            ac = self._accumulate(obj, ac, **fields)
+
+        if ac is not None:
+            yield ac
+
+        # XXX: add option to fill in holes at start of timeseries?
+        return
+
+
+class StatsManager(caching.base.CachingManager):
+    """Manager returning date-filtered, newest-first StatsQuerySets."""
+
+    def __init__(self, date_field='date'):
+        super(StatsManager, self).__init__()
+        self.date_field = date_field
+
+    def contribute_to_class(self, cls, name):
+        super(StatsManager, self).contribute_to_class(cls, name)
+
+        # StatsQuerySet looks for our date field on the model
+        cls.add_to_class('stats_date_field', self.date_field)
+
+    def get_query_set(self):
+        # The summary methods of StatsQuerySet require `date desc` ordering
+        # so make that the default as a convenience.
+        date_order = '-' + self.date_field
+
+        # Filter out '0000-00-00' dates which are sadly valid in the
+        # stats tables but mean nothing for analysis. '0000-00-00' is not
+        # null and does not have a python equivalent, so we have to filter
+        # using an inexact comparison.
+        date_filter = {self.date_field + '__gt': date(1990, 1, 1)}
+
+        return (StatsQuerySet(model=self.model)
+                .filter(**date_filter).order_by(date_order))
+
+
+class StatsDict(dict):
+    """A dictionary of (possibly nested) counts that supports `+`."""
+
+    def __add__(self, d):
+        """Combines two dictionaries, summing values where keys overlap.
+
+        Example:
+        >>> a = StatsDict({'a': 1, 'b': 2})
+        >>> b = StatsDict({'a': 1, 'c': 4})
+        >>> a + b
+        {'a': 2, 'b': 2, 'c': 4}
+        """
+        # Let Python raise TypeError for non-dict operands (None, numbers).
+        if not isinstance(d, dict):
+            return NotImplemented
+        return StatsDict(self._rdict_sum(self, d))
+
+    @classmethod
+    def _rdict_sum(cls, a, b):
+        """Recursively sum two dictionaries."""
+        result = {}
+        for k in set(a).union(b):
+            a_val, b_val = (a.get(k, 0), b.get(k, 0))
+            if isinstance(a_val, dict) and not isinstance(b_val, dict):
+                b_val = {}
+            elif isinstance(b_val, dict) and not isinstance(a_val, dict):
+                a_val = {}
+            if isinstance(a_val, dict):
+                result[k] = cls._rdict_sum(a_val, b_val)
+            else:
+                result[k] = a_val + b_val
+        return result
+
+
+class StatsDictField(models.TextField):
+    """Text column of serialized php exposed as a StatsDict (or None)."""
+
+    description = 'A dictionary of counts stored as serialized php.'
+    __metaclass__ = models.SubfieldBase
+
+    def db_type(self):
+        return 'text'
+
+    def to_python(self, value):
+        """Return a StatsDict for dicts and valid php, otherwise None."""
+        # object case
+        if value is None:
+            return None
+        if isinstance(value, dict):
+            return StatsDict(value)
+
+        # string case
+        try:
+            d = php.unserialize(value)
+        except ValueError:
+            d = None
+        if isinstance(d, dict):
+            return StatsDict(d)
+        return None
+
+    def get_db_prep_value(self, value):
+        """Serialize a dict-like value to php; anything else becomes None."""
+        try:
+            value = php.serialize(dict(value))
+        except (TypeError, ValueError):
+            value = None
+        return value
+
+    def value_to_string(self, obj):
+        """Serialize this field's value on obj (not obj itself)."""
+        return self.get_db_prep_value(self._get_val_from_obj(obj))
diff --git a/apps/stats/fixtures/stats/test_models.json b/apps/stats/fixtures/stats/test_models.json
new file mode 100644
index 0000000000..bf23439640
--- /dev/null
+++ b/apps/stats/fixtures/stats/test_models.json
@@ -0,0 +1,134 @@
+[
+    {
+        "pk": 1,
+        "model": "addons.addontype",
+        "fields": {
+            "name": null,
+            "name_plural": null,
+            "description": null,
+            "modified": "2006-07-11 11:11:11",
+            "created": "2006-06-11 18:23:31"
+        }
+    },
+    {
+        "pk": 4,
+        "model": "addons.addon",
+        "fields": {
+            "type": 1,
+            "status": 4,
+            "highest_status": 4,
+            "description": null,
+            "modified": "2008-05-22 11:59:13",
+            "name": null,
+            "created": "2004-06-11 18:23:31"
+        }
+    },
+    {
+        "pk": 1,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-01"
+        }
+    },
+    {
+        "pk": 2,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-07"
+        }
+    },
+    {
+        "pk": 3,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-12"
+        }
+    },
+    {
+        "pk": 4,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-20"
+        }
+    },
+    {
+        "pk": 5,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-28"
+        }
+    },
+    {
+        "pk": 6,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-07-03"
+        }
+    },
+    {
+        "pk": 7,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-08-03"
+        }
+    },
+    {
+        "pk": 8,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-09-03"
+        }
+    },
+    {
+        "pk": 1,
+        "model": "stats.updatecount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "versions": "a:2:{s:3:\"1.0\";i:2;s:3:\"2.0\";i:8;}",
+            "statuses": "a:2:{s:11:\"userEnabled\";i:9;s:12:\"userDisabled\";i:1;}",
+            "applications": "a:1:{s:38:\"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}\";a:1:{s:5:\"3.0.9\";i:7;}}",
+            "oses": "a:2:{s:5:\"Linux\";i:3;s:5:\"WINNT\";i:7;}",
+            "locales": null,
+            "date": "2009-06-01"
+        }
+    },
+    {
+        "pk": 2,
+        "model": "stats.updatecount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "versions": "a:2:{s:3:\"1.0\";i:2;s:3:\"2.0\";i:8;}",
+            "statuses": "a:2:{s:11:\"userEnabled\";i:9;s:12:\"userDisabled\";i:1;}",
+            "applications": "a:1:{s:38:\"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}\";a:1:{s:5:\"3.0.9\";i:6;}}",
+            "oses": "a:2:{s:5:\"Linux\";i:3;s:5:\"WINNT\";i:7;}",
+            "locales": null,
+            "date": "2009-06-02"
+        }
+    }
+]
diff --git a/apps/stats/models.py b/apps/stats/models.py
index a27c057002..907f55307b 100644
--- a/apps/stats/models.py
+++ b/apps/stats/models.py
@@ -2,14 +2,18 @@ from django.db import models
 
 import caching.base
 
+from .db import StatsDict, StatsDictField, StatsManager
+
 
 class DownloadCount(caching.base.CachingMixin, models.Model):
     addon = models.ForeignKey('addons.Addon')
-    count = models.IntegerField()
+    count = models.PositiveIntegerField()
     date = models.DateField()
-    src = models.TextField()
 
-    objects = caching.base.CachingManager()
+    # Leave this out of queries if you can.
+    sources = StatsDictField(db_column='src', null=True)
+
+    objects = StatsManager('date')
 
     class Meta:
         db_table = 'download_counts'
@@ -17,15 +21,17 @@ class DownloadCount(caching.base.CachingMixin, models.Model):
 
 class UpdateCount(caching.base.CachingMixin, models.Model):
     addon = models.ForeignKey('addons.Addon')
-    count = models.IntegerField()
+    count = models.PositiveIntegerField()
     date = models.DateField()
 
     # Leave these out of queries if you can.
-    application = models.TextField()
-    locale = models.TextField()
-    os = models.TextField()
-    status = models.TextField()
-    version = models.TextField()
+    versions = StatsDictField(db_column='version', null=True)
+    statuses = StatsDictField(db_column='status', null=True)
+    applications = StatsDictField(db_column='application', null=True)
+    oses = StatsDictField(db_column='os', null=True)
+    locales = StatsDictField(db_column='locale', null=True)
+
+    objects = StatsManager('date')
 
     class Meta:
         db_table = 'update_counts'
diff --git a/apps/stats/tests/test_db.py b/apps/stats/tests/test_db.py
new file mode 100644
index 0000000000..ec60ae0ed8
--- /dev/null
+++ b/apps/stats/tests/test_db.py
@@ -0,0 +1,45 @@
+from datetime import date, datetime
+
+from django import test
+
+from nose.tools import eq_
+
+from stats.db import StatsDict
+from stats.db import prev_month
+
+
+class TestStatsDict(test.TestCase):
+    dict_a = StatsDict({'a': 3, 'nested': {'b': 5, 'c': 6}})
+    dict_b = StatsDict({'a': 3, 'b': 1, 'nested': {'b': 5, 'c': 6}})
+    dict_empty = StatsDict({})
+
+    def test_add(self):
+        d = self.dict_empty + self.dict_empty
+        eq_(d, self.dict_empty)
+
+        d = self.dict_a + self.dict_empty
+        eq_(d, self.dict_a)
+
+        d = self.dict_a + self.dict_b
+        res = StatsDict({'a': 6, 'b': 1, 'nested': {'b': 10, 'c': 12}})
+        eq_(d, res)
+
+
+class TestDateUtils(test.TestCase):
+
+    def test_prev_month(self):
+        from_to = [
+            (date(2008, 1, 1), date(2007, 12, 1)),
+            (date(2008, 1, 31), date(2007, 12, 1)),
+            (date(2008, 2, 1), date(2008, 1, 1)),
+            (date(2008, 2, 29), date(2008, 1, 1)),
+            (datetime(2008, 2, 29, 23, 59, 59), date(2008, 1, 1)),
+            (date(2008, 3, 1), date(2008, 2, 1)),
+            (date(2008, 3, 31), date(2008, 2, 1)),
+            (date(2008, 4, 1), date(2008, 3, 1)),
+            (date(2008, 4, 30), date(2008, 3, 1)),
+            (date(2008, 12, 1), date(2008, 11, 1)),
+            (date(2008, 12, 31), date(2008, 11, 1)),
+        ]
+        for (d, expected) in from_to:
+            eq_(prev_month(d), expected, 'unexpected prev_month result')
diff --git a/apps/stats/tests/test_models.py b/apps/stats/tests/test_models.py
new file mode 100644
index 0000000000..53c34e82bf
--- /dev/null
+++ b/apps/stats/tests/test_models.py
@@ -0,0 +1,117 @@
+from django import test
+from nose.tools import eq_
+from stats.models import DownloadCount, UpdateCount, StatsDict
+
+from datetime import date
+
+
+class TestDownloadCountModel(test.TestCase):
+    fixtures = ['stats/test_models.json']
+
+    def test_sources(self):
+        dc = DownloadCount.objects.get(id=1)
+
+        assert isinstance(dc.sources, StatsDict), 'sources is not a StatsDict'
+        assert len(dc.sources) > 0, 'sources is empty'
+
+    def test_summary(self):
+        # somewhat contrived, but a good test: summarize the entire dataset
+        summary = DownloadCount.objects.all().summary(
+                count_sum='count', sources_sum='sources')
+
+        eq_(len(summary), 4, 'unexpected number of keys in summary')
+        eq_(summary['start'], None, 'start is not None')
+        assert summary['row_count'] > 0, 'zero rows in summary'
+        assert summary['count_sum'] > 0, 'zero count_sum in summary'
+        assert sum(summary['sources_sum'].values()) > 0, \
+                'zero sources in summary'
+
+    def test_remap_special_fields(self):
+        qs = DownloadCount.objects.filter(pk=1)
+        days = list(qs.daily_summary(date='start', rows='row_count',
+                                     start='count'))
+
+        eq_(len(days), 1, 'unexpected number of days')
+        assert 'date' in days[0], 'date key not in summary results'
+        assert 'rows' in days[0], 'rows key not in summary results'
+        assert 'start' in days[0], 'start key not in summary results'
+        eq_(days[0]['date'], date(2009, 6, 1), 'unexpected date value')
+        eq_(days[0]['rows'], 1, 'unexpected rows value')
+        eq_(days[0]['start'], 10, 'unexpected start value')
+
+    def test_weekly_summary(self):
+        qs = DownloadCount.objects.filter(addon=4,
+                date__range=(date(2009, 6, 1), date(2009, 7, 3)))
+        weeks = list(qs.weekly_summary('count', 'sources'))
+
+        eq_(len(weeks), 5, 'unexpected number of weeks')
+        eq_(weeks[0]['start'], date(2009, 6, 29),
+            'unexpected start date for week 1')
+        eq_(weeks[4]['start'], date(2009, 6, 1),
+            'unexpected start date for week 5')
+        eq_(weeks[4]['row_count'], 2, 'unexpected # of rows in week 5')
+        eq_(weeks[4]['count'], 20, 'unexpected count total in week 5')
+        eq_(sum(weeks[4]['sources'].values()), 10,
+            'unexpected sources total in week 5')
+
+    def test_monthly_summary(self):
+        qs = DownloadCount.objects.filter(addon=4,
+                date__range=(date(2009, 6, 1), date(2009, 9, 30)))
+        months = list(qs.monthly_summary('count', 'sources'))
+
+        eq_(len(months), 4, 'unexpected number of months')
+        eq_(months[0]['start'], date(2009, 9, 1),
+            'unexpected start date for month 1')
+        eq_(months[3]['start'], date(2009, 6, 1),
+            'unexpected start date for month 4')
+        eq_(months[3]['row_count'], 5, 'unexpected # of rows in month 4')
+        eq_(months[3]['count'], 50, 'unexpected count total in month 4')
+        eq_(sum(months[3]['sources'].values()), 25,
+            'unexpected sources total in month 4')
+
+    def test_daily_fill_holes(self):
+        qs = DownloadCount.objects.filter(addon=4,
+                date__range=(date(2009, 6, 1), date(2009, 6, 7)))
+        days = list(qs.daily_summary('count', 'sources', fill_holes=True))
+
+        eq_(len(days), 7, 'unexpected number of days')
+        eq_(days[1]['start'], date(2009, 6, 6),
+            'unexpected start date for day 2')
+        eq_(days[1]['row_count'], 0, 'unexpected non-zero row_count')
+        eq_(days[1]['count'], 0, 'unexpected non-zero count')
+        eq_(days[1]['sources'], {}, 'unexpected non-empty sources')
+
+
+class TestUpdateCountModel(test.TestCase):
+    fixtures = ['stats/test_models.json']
+    test_app = '{ec8030f7-c20a-464f-9b0e-13a3a9e97384}'
+    test_ver = '3.0.9'
+
+    def test_serial_types(self):
+        uc = UpdateCount.objects.get(id=1)
+
+        assert isinstance(uc.versions, StatsDict), 'versions not a StatsDict'
+        assert isinstance(uc.statuses, StatsDict), 'statuses not a StatsDict'
+        assert isinstance(uc.applications, StatsDict), \
+                'applications not a StatsDict'
+        assert isinstance(uc.oses, StatsDict), 'oses not a StatsDict'
+        assert uc.locales is None, 'locales is not None'
+        assert len(uc.statuses) > 0, 'statuses is empty'
+
+    def test_applications(self):
+        uc = UpdateCount.objects.get(id=1)
+
+        assert isinstance(uc.applications[self.test_app], dict), \
+                'applications item is not a dict'
+        assert uc.applications[self.test_app][self.test_ver] == 7, \
+                'unexpected count for app version'
+
+    def test_applications_summary(self):
+        qs = UpdateCount.objects.filter(addon=4,
+                date__range=(date(2009, 6, 1), date(2009, 6, 2)))
+        summary = qs.summary(apps='applications')
+
+        eq_(summary['row_count'], 2,
+            'unexpected row_count in applications summary')
+        eq_(summary['apps'][self.test_app][self.test_ver], 13,
+            'unexpected total for app version')