diff --git a/apps/jetpack/cron.py b/apps/jetpack/cron.py index a9ce0c6e..c66e6484 100644 --- a/apps/jetpack/cron.py +++ b/apps/jetpack/cron.py @@ -8,9 +8,15 @@ from django.conf import settings import commonware import cronjobs -from .models import Package from . import tasks +from jetpack.models import Package +from jetpack import tasks + +from celery.messaging import establish_connection +from celeryutils import chunked +from elasticutils import get_es + log = commonware.log.getLogger('f.cron') def _prune_older_files(directory, age): @@ -52,16 +58,10 @@ def gc(): @cronjobs.register -def package_activity(): - """ - Collect all Packages, and update their daily_activity based - on if they have been active today. - - Should be run nightly. - """ - tasks.fill_package_activity.delay(full_year=False) - - -@cronjobs.register -def fill_package_activity(): - tasks.fill_package_activity.delay(full_year=True) +def update_package_activity(): + """Recalculates package activity rating for all packages""" + ids = Package.objects.all().values_list('id', flat=True) + log.info("Updating package activity for %s packages" % len(ids)) + with establish_connection() as conn: + for chunk in chunked(ids, 100): + tasks.calculate_activity_rating.apply_async(args=[chunk], connection=conn) diff --git a/apps/jetpack/models.py b/apps/jetpack/models.py index d46591de..337c6a18 100644 --- a/apps/jetpack/models.py +++ b/apps/jetpack/models.py @@ -10,6 +10,7 @@ import tempfile import markdown import hashlib import codecs +from decimal import Decimal, getcontext from copy import deepcopy from django.core.exceptions import ObjectDoesNotExist, ValidationError @@ -23,6 +24,7 @@ from django.core.urlresolvers import reverse from django.template.defaultfilters import slugify from django.conf import settings from django.utils.translation import ugettext as _ +from django.db import connection from cuddlefish.preflight import vk_to_jid, jid_to_programid, my_b32encode from ecdsa import SigningKey, NIST256p @@ -1468,10 +1470,9 @@ class Package(BaseModel, SearchMixin): active = models.BooleanField(default=True, blank=True) # deleted is the limbo state deleted = models.BooleanField(default=False, blank=True) - - # activity - year_of_activity = models.CharField(max_length=365, default='0'*365) - activity_updated_at = models.DateTimeField(null=True, blank=True) + + #package activity score + activity_rating = models.DecimalField(default=0.0, max_digits=4, decimal_places=3) class Meta: " Set the ordering of objects " @@ -1781,42 +1782,48 @@ class Package(BaseModel, SearchMixin): if self.version_name: self.version_name = alphanum_plus(self.version_name) - def get_activity_rating(self): + def calc_activity_rating(self): """ - Build a weighted average based on activity from daily_activity - and recency of that activity. + Build a weighted average based on package revisions """ - # slices are by week - # first couple weeks are weighted high - # rest of the weeks are super tiny points - slice_size = 7 - slices = len(self.year_of_activity) / slice_size - - weights = { - '0': 0.20, - '1': 0.15, - '2': 0.10, - '3': 0.05, - } - - remaining_percentage = 1.0 - sum(weights.values()) - standard_weight = remaining_percentage / (slices - len(weights.keys())) - total = 0 - - for i in range(slices): - # slice_ is like '1100101' - slice_start = i * slice_size - slice_end = slice_start + slice_size - points = self.year_of_activity.count('1', slice_start, slice_end) - - weight = weights.get(str(i), standard_weight) - weighted_points = points * weight - - total = total + weighted_points - - rating = total / slice_size - return rating - + + getcontext().prec = 3 + + #update tests if you change this. + weights = [ + { 'start': 1, 'end': 7, 'weight': Decimal('0.30') }, + { 'start': 8, 'end': 14, 'weight': Decimal('0.20') }, + { 'start': 15, 'end': 21, 'weight': Decimal('0.15') }, + { 'start': 22, 'end': 52, 'weight': Decimal('0.15') }, + { 'start': 53, 'end': 365, 'weight': Decimal('0.20') } + ] + + q = [] + + for idx, w in enumerate(weights): + q.append(""" + SELECT count(Days)/{3}, {4} as Row FROM + (SELECT count(*) as Days FROM jetpack_packagerevision + WHERE + package_id = {0} AND + TO_DAYS(created_at) <= TO_DAYS(DATE_SUB(CURDATE(), INTERVAL {1} DAY)) AND + TO_DAYS(created_at) >= TO_DAYS(DATE_SUB(CURDATE(), INTERVAL {2} DAY)) + group by package_id, TO_DAYS(created_at)) x + """.format(self.id,w['start'],w['end'],w['end']+1 - w['start'], idx)) + + query = " UNION ".join(q) + + + cursor = connection.cursor() + cursor.execute(query) + + result = Decimal('0') + + for idx, val in enumerate([row[0] for row in cursor.fetchall()]): + result += weights[idx]['weight'] * val + + return result + @es_required def refresh_index(self, es, bulk=False): @@ -1830,10 +1837,14 @@ class Package(BaseModel, SearchMixin): .exclude(package=self) .values_list('package_id', flat=True))) data['copies_count'] = len(data['copies']) - - del data['year_of_activity'] - data['activity'] = self.get_activity_rating() - + + log.debug(data) + + # hack for ES, because a decimal is serialized as 'Decimal('0.302')' + # so we must convert that to a float + data['activity'] = float(self.activity_rating) + del data['activity_rating'] + try: if self.latest: deps = self.latest.dependencies.all() diff --git a/apps/jetpack/tasks.py b/apps/jetpack/tasks.py index 90526d70..11d648bc 100644 --- a/apps/jetpack/tasks.py +++ b/apps/jetpack/tasks.py @@ -2,47 +2,22 @@ import datetime import commonware.log from celery.decorators import task -from .models import Package +from jetpack.models import Package +from elasticutils import get_es log = commonware.log.getLogger('f.celery') + @task -def fill_package_activity(full_year=False, *args, **kwargs): - """ - Collect all the revisions for each package, distinct by day, in the past year - and determine the year of activity. - """ - log.info('Inserting data into year_of_activity.') - pkgs = Package.objects.filter(deleted=False) - now = datetime.datetime.utcnow() - year = 365 - last_year = now - datetime.timedelta(year) - - for pkg in pkgs: - if full_year or not pkg.activity_updated_at: - days = year - time_since = last_year - else: - time_since = pkg.activity_updated_at - days = (now - time_since).days - - if days <= 0: - continue - - revs = (pkg.revisions.filter(created_at__gte=time_since) - .order_by('-created_at')) - - activity = list('0'*days) - - for rev in revs: - day = (now - rev.created_at).days - activity[day] = '1' - - - activity = ''.join(activity) - pkg.year_of_activity = activity + pkg.year_of_activity[:-days] - pkg.activity_updated_at = now - pkg.save() - - log.info('Finished filling data into year_of_activity.') - +def calculate_activity_rating(pks,**kw): + ids_str = ','.join(map(str, pks)) + log.debug('ES starting calculate_activity_rating for packages: [%s]' + % ids_str) + + for package in Package.objects.filter(pk__in=pks): + package.activity_rating = package.calc_activity_rating() + package.save() + + log.debug('ES completed calculate_activity_rating for packages: [%s]' + % ids_str) + \ No newline at end of file diff --git a/apps/jetpack/tests/package_tests.py b/apps/jetpack/tests/package_tests.py index f189968c..9cda49c3 100644 --- a/apps/jetpack/tests/package_tests.py +++ b/apps/jetpack/tests/package_tests.py @@ -1,6 +1,7 @@ import os import datetime import commonware +from decimal import Decimal from test_utils import TestCase from nose import SkipTest @@ -332,14 +333,40 @@ class PackageTest(TestCase): addon_saved = Package.objects.get(author=self.author, type='a') eq_(addon_saved.description, description) - def test_activity_rating(self): - pack = Package() + def test_activity_rating_calculation_one_year(self): + addon = Package.objects.create(author=self.author, type='a') + + eq_(0, addon.calc_activity_rating()) - pack.year_of_activity = '0' * 365 - eq_(pack.get_activity_rating(), 0) + now = datetime.datetime.utcnow() + + for i in range(1,366): + r = addon.revisions.create(author=self.author, revision_number=i) + r.created_at=now-datetime.timedelta(i) + super(PackageRevision, r).save() + + #created packages, including initial + eq_(366, addon.revisions.count()) + eq_(Decimal('1'), addon.calc_activity_rating()) + + def test_activity_rating_calculation_first_week(self): + addon = Package(type='a', author=self.author) + addon.save() - pack.year_of_activity = '1' * 365 - eq_(pack.get_activity_rating() > 0.99, True) + now = datetime.datetime.utcnow() + + # Create 1 weeks worth of revisions... should equal .30 of score + # see models.py def Packages for weights + + for i in range(1,8): + r = addon.revisions.create(author=self.author, revision_number=i) + r.created_at=now-datetime.timedelta(i) + super(PackageRevision, r).save() + + eq_(8, addon.revisions.count()) + + eq_(Decimal('0.300'), addon.calc_activity_rating()) + diff --git a/apps/jetpack/tests/revision_tests.py b/apps/jetpack/tests/revision_tests.py index 17abfa31..ee6f1ece 100644 --- a/apps/jetpack/tests/revision_tests.py +++ b/apps/jetpack/tests/revision_tests.py @@ -2,6 +2,7 @@ import commonware import tempfile import os import datetime +import decimal from test_utils import TestCase @@ -10,10 +11,11 @@ from nose.tools import eq_ from django.contrib.auth.models import User from django.conf import settings +from jetpack.tasks import calculate_activity_rating from jetpack.models import Package, PackageRevision, Module, Attachment, SDK from jetpack.errors import SelfDependencyException, FilenameExistException, \ DependencyException -from jetpack.tasks import fill_package_activity + from base.templatetags.base_helpers import hashtag log = commonware.log.getLogger('f.test') @@ -391,48 +393,27 @@ class PackageRevisionTest(TestCase): assert old_rev.name assert old_package.full_name assert old_package.name - - def test_fill_package_activity(self): - orig = '0'*365 + + def test_update_package_activity_cron(self): addon = Package(type='a', author=self.author) addon.save() - eq_(addon.year_of_activity, orig) - - fill_package_activity.delay() - - addon = Package.objects.get(pk=addon.pk) - - new = '1' + orig[:-1] - eq_(addon.year_of_activity, new) - - def test_package_activity_cron(self): - addon = Package(type='a', author=self.author) - addon.save() - fill_package_activity.delay(full_year=True) - now = datetime.datetime.utcnow() - # Superficially creating revisions in the past - r2 = addon.revisions.create(author=self.author, revision_number=2) - r2.created_at=now-datetime.timedelta(5) - super(PackageRevision, r2).save() - - r3 = addon.revisions.create(author=self.author, revision_number=3) - r3.created_at=now-datetime.timedelta(3) - super(PackageRevision, r3).save() - + # Create 1 weeks worth of revisions... should equal .30 of score + # see models.py def Packages for weights + + for i in range(1,8): + r = addon.revisions.create(author=self.author, revision_number=i) + r.created_at=now-datetime.timedelta(i) + super(PackageRevision, r).save() + + #run task on this one package + calculate_activity_rating([addon.pk]) + addon = Package.objects.get(pk=addon.pk) - addon.activity_updated_at = now - datetime.timedelta(4) - addon.save() - - - old = addon.year_of_activity - fill_package_activity.delay(full_year=False) - - addon = Package.objects.get(pk=addon.pk) - - eq_('1001'+old[:-4], addon.year_of_activity) + + eq_(addon.activity_rating, addon.calc_activity_rating()) diff --git a/apps/search/cron.py b/apps/search/cron.py index 94bced19..b4756a78 100644 --- a/apps/search/cron.py +++ b/apps/search/cron.py @@ -22,6 +22,10 @@ def index_all(): for chunk in chunked(ids, 100): tasks.index_all.apply_async(args=[chunk], connection=conn) + + + + @cronjobs.register def setup_mapping(): """Create index, and setup mapping, for ES.""" diff --git a/apps/search/helpers.py b/apps/search/helpers.py index 0addea6d..705c415b 100644 --- a/apps/search/helpers.py +++ b/apps/search/helpers.py @@ -73,13 +73,16 @@ def _get_average_activity(): return average # TODO: ES has statistical facet that can provide average, but I couldn't # get it working. - qs = Package.search().filter(activity__gt=0) + + qs = Package.search().filter(activity__gt=0.001) values = qs.values('activity')[:qs.count()] + num = len(values) + if num > 0: average = sum(v[1] for v in values) / num else: average = 0.2 - + cache.set(ACTIVITY_CACHE_KEY, average, 60*60*24) return average diff --git a/apps/search/tasks.py b/apps/search/tasks.py index 25e8d327..558b205b 100644 --- a/apps/search/tasks.py +++ b/apps/search/tasks.py @@ -24,7 +24,11 @@ def index_all(pks, **kw): else: log.debug('ES finished bulk action for packages: [%s]' % ids_str) + @task def index_one(pk, **kw): package = Package.objects.get(pk=pk) - package.refresh_index() \ No newline at end of file + package.refresh_index() + + + diff --git a/migrations/019-refactor_package_activity.sql b/migrations/019-refactor_package_activity.sql new file mode 100644 index 00000000..09ce46b8 --- /dev/null +++ b/migrations/019-refactor_package_activity.sql @@ -0,0 +1,3 @@ +ALTER TABLE jetpack_package DROP COLUMN year_of_activity; +ALTER TABLE jetpack_package DROP COLUMN activity_updated_at; +ALTER TABLE jetpack_package ADD COLUMN activity_rating decimal(4,3); \ No newline at end of file