created custom manager and queryset for stats to support common summaries

This also includes a StatsDictField that converts serialized PHP into a summable dictionary.
2010-02-05 20:35:26 -08:00 · 2010-02-05 20:35:26 -08:00 · 0f0530630e
--- a/apps/stats/db.py
+++ b/apps/stats/db.py
@ -0,0 +1,342 @@
+from datetime import date, timedelta
+import phpserialize as php
+
+from django.db import models
+
+import caching.base
+
+
+# Common date helpers
+# These all take a date or datetime and return a date.
+
+
+def prev_month(d):
+    """Return the first day of the month before the one containing ``d``."""
+    if d.month == 1:
+        # January rolls back to December of the previous year.
+        return date(d.year - 1, 12, 1)
+    return date(d.year, d.month - 1, 1)
+
+
+def prev_week(d):
+    """Return the Monday starting the week before ``d``'s week."""
+    monday = d - timedelta(weeks=1, days=d.weekday())
+    return date(monday.year, monday.month, monday.day)
+
+
+def prev_day(d):
+    """Return the calendar day before ``d`` as a plain date."""
+    yesterday = d - timedelta(days=1)
+    return date(yesterday.year, yesterday.month, yesterday.day)
+
+
+def start_of_month(d):
+    """Return the first day of the month containing ``d``."""
+    year, month = d.year, d.month
+    return date(year, month, 1)
+
+
+def start_of_week(d):
+    """Return the Monday of the week containing ``d``."""
+    monday = d - timedelta(days=d.weekday())
+    return date(monday.year, monday.month, monday.day)
+
+
+def start_of_day(d):
+    """Return the calendar date of ``d``, dropping any time component."""
+    year, month, day = d.year, d.month, d.day
+    return date(year, month, day)
+
+
+class StatsQuerySet(caching.base.CachingQuerySet):
+    """Caching queryset that can total its rows overall or per date period.
+
+    The model must expose a ``stats_date_field`` attribute naming the date
+    field used to group rows into periods.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(StatsQuerySet, self).__init__(*args, **kwargs)
+        # Read the model from the instance instead of kwargs['model'] so a
+        # positionally supplied model no longer raises KeyError.
+        self._stats_date_field = self.model.stats_date_field
+
+    def summary(self, *fields, **kwargs):
+        """Summarizes the entire queryset.
+
+        Arguments should be the names of summable fields found in the queryset.
+        Fields may be renamed in the results by using named arguments:
+
+        >>> qs.summary('swallows', dead_parrots='parrots')
+        {'row_count': 1, 'start': None, 'swallows': 10, 'dead_parrots': 7}
+        """
+
+        fields = self._map_fields(*fields, **kwargs)
+        summary = self.zero_summary(None, **fields)
+        for obj in self:
+            summary = self._accumulate(obj, summary, **fields)
+
+        return summary
+
+    def daily_summary(self, *fields, **kwargs):
+        """Generate daily/weekly/monthly summaries on the queryset.
+
+        The queryset must be in reverse chronological order.
+
+        Recognized keyword arguments:
+            fill_holes
+                If True, create zero count summaries for periods in the
+                middle of the queryset that contain no records
+
+        All other arguments should be the names of summable fields found
+        in the queryset. Fields may be renamed in the results by using
+        named arguments, for example:
+
+        >>> [s for s in q.daily_summary('swallows', dead_parrots='parrots')]
+        [{'row_count': 1, 'start': date(2009, 5, 3), 'swallows': 0,
+          'dead_parrots': 2},
+         {'row_count': 1, 'start': date(2009, 5, 2), 'swallows': 10,
+          'dead_parrots': 5}]
+        """
+        fill_holes = kwargs.pop('fill_holes', False)
+        fields = self._map_fields(*fields, **kwargs)
+        return self._summary_iter(fields, fill_holes=fill_holes)
+
+    def weekly_summary(self, *fields, **kwargs):
+        fill_holes = kwargs.pop('fill_holes', False)
+        fields = self._map_fields(*fields, **kwargs)
+        return self._summary_iter(fields, fill_holes=fill_holes,
+            previous_date=prev_week, format_date=start_of_week)
+
+    # weekly and monthly summaries share the daily docstring
+    weekly_summary.__doc__ = daily_summary.__doc__
+
+    def monthly_summary(self, *fields, **kwargs):
+        fill_holes = kwargs.pop('fill_holes', False)
+        fields = self._map_fields(*fields, **kwargs)
+        return self._summary_iter(fields, fill_holes=fill_holes,
+            previous_date=prev_month, format_date=start_of_month)
+
+    monthly_summary.__doc__ = daily_summary.__doc__
+
+    def zero_summary(self, start_date, **fields):
+        """Returns a dictionary of zero values for the mapped fields.
+
+        ``fields`` maps result keys to field names (see _map_fields). Only
+        keys present in ``fields`` appear in the result; the special field
+        names 'start' and 'row_count' yield ``start_date`` and 0, and any
+        other name must be a real field on the model:
+
+        >>> qs.zero_summary(date(2009, 1, 1), start='start',
+        ...                 row_count='row_count', count_total='count')
+        {'start': date(2009, 1, 1), 'row_count': 0, 'count_total': 0}
+        """
+
+        res = {}
+        for res_key, fname in fields.items():
+            # handle special summary fields
+            if fname == 'start':
+                res[res_key] = start_date
+                continue
+            elif fname == 'row_count':
+                res[res_key] = 0
+                continue
+
+            # handle regular model fields
+            field = self.model._meta.get_field_by_name(fname)[0]
+            if isinstance(field, StatsDictField):
+                # an empty summable dictionary
+                res[res_key] = StatsDict()
+            else:
+                # everything else starts at 0 whether it can be summed or not
+                res[res_key] = 0
+        return res
+
+    def _accumulate(self, obj, ac, **fields):
+        """Accumulates (sums) field values of an object into ``ac``.
+
+        ``ac`` is updated in place and also returned. Only keys mapped in
+        ``fields`` are touched, so 'row_count' is incremented only when it
+        is part of the mapping.
+
+        Example:
+
+        >>> ac = {'start': date(2009, 1, 1), 'row_count': 2, 'sum': 10}
+        >>> someobj.count = 4
+        >>> qs._accumulate(someobj, ac, row_count='row_count', sum='count')
+        {'start': date(2009, 1, 1), 'row_count': 3, 'sum': 14}
+        """
+        for ac_key, field in fields.items():
+            # handle special summary fields
+            if field == 'row_count':
+                ac[ac_key] += 1
+                continue
+            elif field == 'start':
+                continue
+
+            # handle regular model fields
+            try:
+                # works with numbers and StatsDict
+                ac[ac_key] = ac[ac_key] + getattr(obj, field)
+            except TypeError:
+                # anything else (including None) keeps the running value
+                pass
+        return ac
+
+    def _map_fields(self, *fields, **kwargs):
+        """Make a map of result key names to model field names.
+
+        Named arguments take precedence. The special fields 'start' and
+        'row_count' are added implicitly unless they were remapped:
+
+        >>> qs._map_fields('a', 'b', 'c', c='cool', d='duh')
+        {'a': 'a', 'b': 'b', 'c': 'cool', 'd': 'duh',
+         'start': 'start', 'row_count': 'row_count'}
+
+        Raises KeyError when a reserved key ('start' or 'row_count') is
+        used for another field without the special field being remapped.
+        """
+        fields = dict(zip(fields, fields))
+        fields.update(kwargs)
+
+        # the special fields 'start' and 'row_count' are implicit but
+        # may be remapped
+        if 'start' not in fields.values():
+            if 'start' in fields:
+                raise KeyError("reserved field 'start' must be remapped")
+            fields['start'] = 'start'
+
+        if 'row_count' not in fields.values():
+            if 'row_count' in fields:
+                raise KeyError("reserved field 'row_count' must be remapped")
+            fields['row_count'] = 'row_count'
+
+        return fields
+
+    def _summary_iter(self, fields, fill_holes=False, previous_date=prev_day,
+                      format_date=start_of_day):
+        """Generates generic date period summaries of fields in the queryset.
+
+        The queryset is expected in reverse chronological order; one
+        summary dictionary is yielded per period, newest first.
+
+        Arguments:
+            fields
+                A dictionary that maps keys to fieldnames (see _map_fields)
+
+            fill_holes
+                If True, create zero count summaries for periods in the
+                middle of the queryset that contain no records.
+
+            previous_date
+                A function that calculates the start of the next period
+                prior to a date
+
+            format_date
+                A function that calculates the start of the period for
+                a date or datetime
+        """
+        # we support remapping the special 'start' field - find it!
+        start_key = [k for (k, v) in fields.items() if v == 'start'][0]
+
+        ac = None
+
+        for obj in self:
+            d = format_date(getattr(obj, self._stats_date_field))
+
+            if ac is None:
+                # XXX: add option to fill in holes at end of timeseries?
+                # prep first non-zero result
+                ac = self.zero_summary(d, **fields)
+            elif ac[start_key] != d:
+                yield ac
+
+                # option: fill holes in middle of timeseries
+                if fill_holes:
+                    nd = previous_date(ac[start_key])
+                    while nd > d:
+                        # Build a fresh zero summary for every hole so the
+                        # yielded results never share a StatsDict instance.
+                        yield self.zero_summary(nd, **fields)
+                        nd = previous_date(nd)
+
+                # prep next non-zero result
+                ac = self.zero_summary(d, **fields)
+
+            # accumulate
+            ac = self._accumulate(obj, ac, **fields)
+
+        if ac is not None:
+            yield ac
+
+        # XXX: add option to fill in holes at start of timeseries?
+
+
+class StatsManager(caching.base.CachingManager):
+    """Manager returning date-filtered, newest-first StatsQuerySets."""
+
+    def __init__(self, date_field='date'):
+        super(StatsManager, self).__init__()
+        self.date_field = date_field
+
+    def contribute_to_class(self, cls, name):
+        super(StatsManager, self).contribute_to_class(cls, name)
+
+        # Publish the date field name on the model; StatsQuerySet reads it
+        # back as ``stats_date_field``.
+        cls.add_to_class('stats_date_field', self.date_field)
+
+    def get_query_set(self):
+        # '0000-00-00' dates are sadly valid in the stats tables but mean
+        # nothing for analysis. They are not null and have no python
+        # equivalent, so they are excluded with an inexact comparison.
+        cutoff = {'%s__gt' % self.date_field: date(1990, 1, 1)}
+
+        # The summary methods of StatsQuerySet require `date desc`
+        # ordering, so it is applied by default as a convenience.
+        newest_first = '-%s' % self.date_field
+
+        qs = StatsQuerySet(model=self.model)
+        return qs.filter(**cutoff).order_by(newest_first)
+
+
+class StatsDict(dict):
+    """A dictionary of (possibly nested) counts that supports ``+``."""
+
+    def __add__(self, d):
+        """Combines two dictionaries, summing values where keys overlap.
+
+        Returns a new StatsDict; neither operand is modified. Adding
+        anything that is not a dict is unsupported and makes Python raise
+        TypeError.
+
+        Example:
+            >>> a = StatsDict({'a': 1, 'b': 2})
+            >>> b = StatsDict({'a': 1, 'c': 4})
+            >>> a + b
+            {'a': 2, 'b': 2, 'c': 4}
+        """
+        if not isinstance(d, dict):
+            # Signal an unsupported operand the standard way instead of
+            # failing somewhere inside _rdict_sum with an incidental
+            # TypeError or AttributeError.
+            return NotImplemented
+        return StatsDict(self._rdict_sum(self, d))
+
+    @classmethod
+    def _rdict_sum(cls, a, b):
+        """Recursively sum two dictionaries into a new plain dict."""
+        result = {}
+        for k in set(a).union(b):
+            # a key missing on one side counts as 0
+            a_val, b_val = (a.get(k, 0), b.get(k, 0))
+            # a nested dict on one side only is summed with an empty dict
+            if isinstance(a_val, dict) and not isinstance(b_val, dict):
+                b_val = {}
+            elif isinstance(b_val, dict) and not isinstance(a_val, dict):
+                a_val = {}
+            if isinstance(a_val, dict):
+                result[k] = cls._rdict_sum(a_val, b_val)
+            else:
+                result[k] = a_val + b_val
+        return result
+
+
+class StatsDictField(models.TextField):
+    """Text field holding a serialized-PHP dictionary of counts.
+
+    Values are exposed in Python as StatsDict instances (or None).
+    """
+
+    description = 'A dictionary of counts stored as serialized php.'
+    # SubfieldBase makes Django call to_python() on attribute assignment
+    __metaclass__ = models.SubfieldBase
+
+    def db_type(self):
+        return 'text'
+
+    def to_python(self, value):
+        """Convert a database/string value into a StatsDict.
+
+        Returns None for None, for strings that cannot be unserialized,
+        and for serialized values that are not dictionaries.
+        """
+        # object case
+        if value is None:
+            return None
+        if isinstance(value, dict):
+            return StatsDict(value)
+
+        # string case
+        try:
+            d = php.unserialize(value)
+        except ValueError:
+            d = None
+        if isinstance(d, dict):
+            return StatsDict(d)
+        return None
+
+    def get_db_prep_value(self, value):
+        """Serialize a dict-like value to PHP format for storage.
+
+        Values that raise TypeError while being converted or serialized
+        (such as None) are stored as None.
+        """
+        try:
+            value = php.serialize(dict(value))
+        except TypeError:
+            value = None
+        return value
+
+    def value_to_string(self, obj):
+        """Return this field's value on ``obj`` in its serialized form.
+
+        Used by Django's serializers; the value is read from the model
+        instance and encoded the same way it is stored in the database.
+        """
+        value = self._get_val_from_obj(obj)
+        return self.get_db_prep_value(value)
--- a/apps/stats/fixtures/stats/test_models.json
+++ b/apps/stats/fixtures/stats/test_models.json
@ -0,0 +1,134 @@
+[
+    {
+        "pk": 1,
+        "model": "addons.addontype",
+        "fields": {
+            "name": null,
+            "name_plural": null,
+            "description": null,
+            "modified": "2006-07-11 11:11:11",
+            "created": "2006-06-11 18:23:31"
+        }
+    },
+    {
+        "pk": 4,
+        "model": "addons.addon",
+        "fields": {
+            "type": 1,
+            "status": 4,
+            "highest_status": 4,
+            "description": null,
+            "modified": "2008-05-22 11:59:13",
+            "name": null,
+            "created": "2004-06-11 18:23:31"
+        }
+    },
+    {
+        "pk": 1,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-01"
+        }
+    },
+    {
+        "pk": 2,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-07"
+        }
+    },
+    {
+        "pk": 3,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-12"
+        }
+    },
+    {
+        "pk": 4,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-20"
+        }
+    },
+    {
+        "pk": 5,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-06-28"
+        }
+    },
+    {
+        "pk": 6,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-07-03"
+        }
+    },
+    {
+        "pk": 7,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-08-03"
+        }
+    },
+    {
+        "pk": 8,
+        "model": "stats.downloadcount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "sources": "a:2:{s:6:\"search\";i:3;s:3:\"api\";i:2;}",
+            "date": "2009-09-03"
+        }
+    },
+    {
+        "pk": 1,
+        "model": "stats.updatecount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "versions": "a:2:{s:3:\"1.0\";i:2;s:3:\"2.0\";i:8;}",
+            "statuses": "a:2:{s:11:\"userEnabled\";i:9;s:12:\"userDisabled\";i:1;}",
+            "applications": "a:1:{s:38:\"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}\";a:1:{s:5:\"3.0.9\";i:7;}}",
+            "oses": "a:2:{s:5:\"Linux\";i:3;s:5:\"WINNT\";i:7;}",
+            "locales": null,
+            "date": "2009-06-01"
+        }
+    },
+    {
+        "pk": 2,
+        "model": "stats.updatecount",
+        "fields": {
+            "addon": 4,
+            "count": 10,
+            "versions": "a:2:{s:3:\"1.0\";i:2;s:3:\"2.0\";i:8;}",
+            "statuses": "a:2:{s:11:\"userEnabled\";i:9;s:12:\"userDisabled\";i:1;}",
+            "applications": "a:1:{s:38:\"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}\";a:1:{s:5:\"3.0.9\";i:6;}}",
+            "oses": "a:2:{s:5:\"Linux\";i:3;s:5:\"WINNT\";i:7;}",
+            "locales": null,
+            "date": "2009-06-02"
+        }
+    }
+]
--- a/apps/stats/models.py
+++ b/apps/stats/models.py
@ -2,14 +2,18 @@ from django.db import models

 import caching.base

+from .db import StatsDict, StatsDictField, StatsManager
+

 class DownloadCount(caching.base.CachingMixin, models.Model):
    addon = models.ForeignKey('addons.Addon')
-    count = models.IntegerField()
+    count = models.PositiveIntegerField()
    date = models.DateField()
-    src = models.TextField()

-    objects = caching.base.CachingManager()
+    # Leave this out of queries if you can.
+    sources = StatsDictField(db_column='src', null=True)
+
+    objects = StatsManager('date')

    class Meta:
        db_table = 'download_counts'
@ -17,15 +21,17 @@ class DownloadCount(caching.base.CachingMixin, models.Model):

 class UpdateCount(caching.base.CachingMixin, models.Model):
    addon = models.ForeignKey('addons.Addon')
-    count = models.IntegerField()
+    count = models.PositiveIntegerField()
    date = models.DateField()

    # Leave these out of queries if you can.
-    application = models.TextField()
-    locale = models.TextField()
-    os = models.TextField()
-    status = models.TextField()
-    version = models.TextField()
+    versions = StatsDictField(db_column='version', null=True)
+    statuses = StatsDictField(db_column='status', null=True)
+    applications = StatsDictField(db_column='application', null=True)
+    oses = StatsDictField(db_column='os', null=True)
+    locales = StatsDictField(db_column='locale', null=True)
+
+    objects = StatsManager('date')

    class Meta:
        db_table = 'update_counts'
--- a/apps/stats/tests/test_db.py
+++ b/apps/stats/tests/test_db.py
@ -0,0 +1,45 @@
+from datetime import date, datetime
+
+from django import test
+
+from nose.tools import eq_
+
+from stats.db import StatsDict
+from stats.db import prev_month
+
+
+class TestStatsDict(test.TestCase):
+    """Checks StatsDict addition, including nested dictionaries."""
+    dict_a = StatsDict({'a': 3, 'nested': {'b': 5, 'c': 6}})
+    dict_b = StatsDict({'a': 3, 'b': 1, 'nested': {'b': 5, 'c': 6}})
+    dict_empty = StatsDict({})
+
+    def test_add(self):
+        # empty + empty stays empty
+        eq_(self.dict_empty + self.dict_empty, self.dict_empty)
+
+        # adding an empty dictionary changes nothing
+        eq_(self.dict_a + self.dict_empty, self.dict_a)
+
+        # overlapping keys are summed, recursing into nested dictionaries
+        expected = StatsDict({'a': 6, 'b': 1, 'nested': {'b': 10, 'c': 12}})
+        eq_(self.dict_a + self.dict_b, expected)
+
+
+class TestDateUtils(test.TestCase):
+    """Checks the date helper functions in stats.db."""
+
+    def test_prev_month(self):
+        # (input, expected start of the previous month)
+        cases = (
+            (date(2008, 1, 1), date(2007, 12, 1)),
+            (date(2008, 1, 31), date(2007, 12, 1)),
+            (date(2008, 2, 1), date(2008, 1, 1)),
+            (date(2008, 2, 29), date(2008, 1, 1)),
+            (datetime(2008, 2, 29, 23, 59, 59), date(2008, 1, 1)),
+            (date(2008, 3, 1), date(2008, 2, 1)),
+            (date(2008, 3, 31), date(2008, 2, 1)),
+            (date(2008, 4, 1), date(2008, 3, 1)),
+            (date(2008, 4, 30), date(2008, 3, 1)),
+            (date(2008, 12, 1), date(2008, 11, 1)),
+            (date(2008, 12, 31), date(2008, 11, 1)),
+        )
+        for given, expected in cases:
+            eq_(prev_month(given), expected, 'unexpected prev_month result')
--- a/apps/stats/tests/test_models.py
+++ b/apps/stats/tests/test_models.py
@ -0,0 +1,117 @@
+from django import test
+from nose.tools import eq_
+from stats.models import DownloadCount, UpdateCount, StatsDict
+
+from datetime import date
+
+
+class TestDownloadCountModel(test.TestCase):
+    """Exercises DownloadCount and the StatsQuerySet summary methods."""
+    fixtures = ['stats/test_models.json']
+
+    def test_sources(self):
+        """The sources column is unserialized into a non-empty StatsDict."""
+        row = DownloadCount.objects.get(id=1)
+
+        assert isinstance(row.sources, StatsDict), 'sources is not a StatsDict'
+        assert len(row.sources) > 0, 'sources is empty'
+
+    def test_summary(self):
+        """Summarizing the entire dataset yields totals for every key."""
+        # somewhat contrived, but a good test: summarize the entire dataset
+        everything = DownloadCount.objects.all()
+        summary = everything.summary(count_sum='count',
+                                     sources_sum='sources')
+
+        eq_(len(summary), 4, 'unexpected number of keys in summary')
+        eq_(summary['start'], None, 'start is not None')
+        assert summary['row_count'] > 0, 'zero rows in summary'
+        assert summary['count_sum'] > 0, 'zero count_sum in summary'
+        assert sum(summary['sources_sum'].values()) > 0, \
+                'zero sources in summary'
+
+    def test_remap_special_fields(self):
+        """'start' and 'row_count' may be renamed and their keys reused."""
+        qs = DownloadCount.objects.filter(pk=1)
+        days = list(qs.daily_summary(date='start', rows='row_count',
+                                     start='count'))
+
+        eq_(len(days), 1, 'unexpected number of days')
+        day = days[0]
+        assert 'date' in day, 'date key not in summary results'
+        assert 'rows' in day, 'rows key not in summary results'
+        assert 'start' in day, 'start key not in summary results'
+        eq_(day['date'], date(2009, 6, 1), 'unexpected date value')
+        eq_(day['rows'], 1, 'unexpected rows value')
+        eq_(day['start'], 10, 'unexpected start value')
+
+    def test_weekly_summary(self):
+        """Rows are grouped into weeks, newest week first."""
+        period = (date(2009, 6, 1), date(2009, 7, 3))
+        qs = DownloadCount.objects.filter(addon=4, date__range=period)
+        weeks = list(qs.weekly_summary('count', 'sources'))
+
+        eq_(len(weeks), 5, 'unexpected number of weeks')
+        eq_(weeks[0]['start'], date(2009, 6, 29),
+            'unexpected start date for week 1')
+        oldest = weeks[4]
+        eq_(oldest['start'], date(2009, 6, 1),
+            'unexpected start date for week 5')
+        eq_(oldest['row_count'], 2, 'unexpected # of rows in week 5')
+        eq_(oldest['count'], 20, 'unexpected count total in week 5')
+        eq_(sum(oldest['sources'].values()), 10,
+            'unexpected sources total in week 5')
+
+    def test_monthly_summary(self):
+        """Rows are grouped into months, newest month first."""
+        period = (date(2009, 6, 1), date(2009, 9, 30))
+        qs = DownloadCount.objects.filter(addon=4, date__range=period)
+        months = list(qs.monthly_summary('count', 'sources'))
+
+        eq_(len(months), 4, 'unexpected number of months')
+        eq_(months[0]['start'], date(2009, 9, 1),
+            'unexpected start date for month 1')
+        oldest = months[3]
+        eq_(oldest['start'], date(2009, 6, 1),
+            'unexpected start date for month 4')
+        eq_(oldest['row_count'], 5, 'unexpected # of rows in month 4')
+        eq_(oldest['count'], 50, 'unexpected count total in month 4')
+        eq_(sum(oldest['sources'].values()), 25,
+                'unexpected sources total in month 4')
+
+    def test_daily_fill_holes(self):
+        """fill_holes adds zero summaries for days without any rows."""
+        period = (date(2009, 6, 1), date(2009, 6, 7))
+        qs = DownloadCount.objects.filter(addon=4, date__range=period)
+        days = list(qs.daily_summary('count', 'sources', fill_holes=True))
+
+        eq_(len(days), 7, 'unexpected number of days')
+        hole = days[1]
+        eq_(hole['start'], date(2009, 6, 6),
+            'unexpected start date for day 2')
+        eq_(hole['row_count'], 0, 'unexpected non-zero row_count')
+        eq_(hole['count'], 0, 'unexpected non-zero count')
+        eq_(hole['sources'], {}, 'unexpected non-empty sources')
+
+
+class TestUpdateCountModel(test.TestCase):
+    """Exercises UpdateCount's serialized dictionary columns."""
+    fixtures = ['stats/test_models.json']
+    test_app = '{ec8030f7-c20a-464f-9b0e-13a3a9e97384}'
+    test_ver = '3.0.9'
+
+    def test_serial_types(self):
+        """Serialized columns load as StatsDict; null columns load as None."""
+        uc = UpdateCount.objects.get(id=1)
+
+        assert isinstance(uc.versions, StatsDict), 'versions not a StatsDict'
+        assert isinstance(uc.statuses, StatsDict), 'statuses not a StatsDict'
+        assert isinstance(uc.applications, StatsDict), \
+            'applications not a StatsDict'
+        assert isinstance(uc.oses, StatsDict), 'oses not a StatsDict'
+        # identity check: an empty StatsDict must not pass for None
+        assert uc.locales is None, 'locales is not None'
+        assert len(uc.statuses) > 0, 'statuses is empty'
+
+    def test_applications(self):
+        """Nested application/version counts are reachable by key."""
+        uc = UpdateCount.objects.get(id=1)
+
+        assert isinstance(uc.applications[self.test_app], dict), \
+            'applications item is not a dict'
+        assert uc.applications[self.test_app][self.test_ver] == 7, \
+            'unexpected count for app version'
+
+    def test_applications_summary(self):
+        """Summaries add nested application counts across rows."""
+        qs = UpdateCount.objects.filter(addon=4,
+                date__range=(date(2009, 6, 1), date(2009, 6, 2)))
+        summary = qs.summary(apps='applications')
+
+        eq_(summary['row_count'], 2,
+            'unexpected row_count in applications summary')
+        eq_(summary['apps'][self.test_app][self.test_ver], 13,
+            'unexpected total for app version')