зеркало из https://github.com/mozilla/FlightDeck.git
Refactored Package Activity
Package activity is now calculated only during the nightly cron (update_package_activity). The result is stored in Package.activity_rating and is picked up and indexed as `activity` on refresh_index.
This commit is contained in:
Родитель
ba92ead8b9
Коммит
890e38996e
|
@ -8,9 +8,15 @@ from django.conf import settings
|
|||
import commonware
|
||||
import cronjobs
|
||||
|
||||
from .models import Package
|
||||
from . import tasks
|
||||
|
||||
from jetpack.models import Package
|
||||
from jetpack import tasks
|
||||
|
||||
from celery.messaging import establish_connection
|
||||
from celeryutils import chunked
|
||||
from elasticutils import get_es
|
||||
|
||||
log = commonware.log.getLogger('f.cron')
|
||||
|
||||
def _prune_older_files(directory, age):
|
||||
|
@ -52,16 +58,10 @@ def gc():
|
|||
|
||||
|
||||
@cronjobs.register
|
||||
def package_activity():
|
||||
"""
|
||||
Collect all Packages, and update their daily_activity based
|
||||
on if they have been active today.
|
||||
|
||||
Should be run nightly.
|
||||
"""
|
||||
tasks.fill_package_activity.delay(full_year=False)
|
||||
|
||||
|
||||
@cronjobs.register
|
||||
def fill_package_activity():
|
||||
tasks.fill_package_activity.delay(full_year=True)
|
||||
def update_package_activity():
    """
    Queue an activity-rating recalculation for every package.

    Collects the ids of all packages and hands them, 100 at a time, to
    the ``calculate_activity_rating`` task over one shared connection.
    """
    package_ids = Package.objects.all().values_list('id', flat=True)
    total = len(package_ids)
    log.info("Updating package activity for %s packages" % total)

    with establish_connection() as conn:
        for batch in chunked(package_ids, 100):
            tasks.calculate_activity_rating.apply_async(
                args=[batch], connection=conn)
|
||||
|
|
|
@ -10,6 +10,7 @@ import tempfile
|
|||
import markdown
|
||||
import hashlib
|
||||
import codecs
|
||||
from decimal import Decimal, getcontext
|
||||
from copy import deepcopy
|
||||
|
||||
from django.core.exceptions import ObjectDoesNotExist, ValidationError
|
||||
|
@ -23,6 +24,7 @@ from django.core.urlresolvers import reverse
|
|||
from django.template.defaultfilters import slugify
|
||||
from django.conf import settings
|
||||
from django.utils.translation import ugettext as _
|
||||
from django.db import connection
|
||||
|
||||
from cuddlefish.preflight import vk_to_jid, jid_to_programid, my_b32encode
|
||||
from ecdsa import SigningKey, NIST256p
|
||||
|
@ -1468,10 +1470,9 @@ class Package(BaseModel, SearchMixin):
|
|||
active = models.BooleanField(default=True, blank=True)
|
||||
# deleted is the limbo state
|
||||
deleted = models.BooleanField(default=False, blank=True)
|
||||
|
||||
# activity
|
||||
year_of_activity = models.CharField(max_length=365, default='0'*365)
|
||||
activity_updated_at = models.DateTimeField(null=True, blank=True)
|
||||
|
||||
#package activity score
|
||||
activity_rating = models.DecimalField(default=0.0, max_digits=4, decimal_places=3)
|
||||
|
||||
class Meta:
|
||||
" Set the ordering of objects "
|
||||
|
@ -1781,42 +1782,48 @@ class Package(BaseModel, SearchMixin):
|
|||
if self.version_name:
|
||||
self.version_name = alphanum_plus(self.version_name)
|
||||
|
||||
def calc_activity_rating(self):
    """
    Build a weighted average based on package revisions.

    The past year is split into the windows listed in ``weights``
    (``start``/``end`` are "days ago", inclusive).  For every window the
    database reports the fraction of days on which at least one revision
    of this package was created; the rating is the sum of those
    fractions, each multiplied by its window weight.  The weights add up
    to 1.

    Returns a Decimal computed with a precision of 3 significant digits.
    """
    # Function-scope import: the module itself only imports Decimal and
    # getcontext from decimal.
    from decimal import localcontext

    #update tests if you change this.
    weights = [
        {'start': 1, 'end': 7, 'weight': Decimal('0.30')},
        {'start': 8, 'end': 14, 'weight': Decimal('0.20')},
        {'start': 15, 'end': 21, 'weight': Decimal('0.15')},
        {'start': 22, 'end': 52, 'weight': Decimal('0.15')},
        {'start': 53, 'end': 365, 'weight': Decimal('0.20')},
    ]

    # One SELECT per window.  The inner query yields one row per distinct
    # day that has a revision; the outer query turns that into the active
    # fraction of the window and tags it with the window index (Row).
    # Window bounds come from the constant table above, so they are
    # formatted in; the package id is bound as a query parameter (%s).
    q = []
    for idx, w in enumerate(weights):
        q.append("""
            SELECT count(Days)/{2}, {3} as Row FROM
                (SELECT count(*) as Days FROM jetpack_packagerevision
                    WHERE
                        package_id = %s AND
                        TO_DAYS(created_at) <= TO_DAYS(DATE_SUB(CURDATE(), INTERVAL {0} DAY)) AND
                        TO_DAYS(created_at) >= TO_DAYS(DATE_SUB(CURDATE(), INTERVAL {1} DAY))
                    group by package_id, TO_DAYS(created_at)) x
            """.format(w['start'], w['end'], w['end'] + 1 - w['start'], idx))

    # Every row carries a distinct Row value, so nothing needs to be
    # de-duplicated; UNION ALL avoids that extra work.
    query = " UNION ALL ".join(q)

    cursor = connection.cursor()
    cursor.execute(query, [self.id] * len(weights))

    # Limit the precision for this calculation only; assigning to
    # getcontext().prec would change Decimal arithmetic for the rest of
    # the thread.
    with localcontext() as ctx:
        ctx.prec = 3
        result = Decimal('0')
        # The row order of a UNION is not guaranteed, so pick the weight
        # by the window index returned with each row, not by position.
        for fraction, row_idx in cursor.fetchall():
            result += weights[int(row_idx)]['weight'] * fraction

    return result
|
||||
|
||||
|
||||
@es_required
|
||||
def refresh_index(self, es, bulk=False):
|
||||
|
@ -1830,10 +1837,14 @@ class Package(BaseModel, SearchMixin):
|
|||
.exclude(package=self)
|
||||
.values_list('package_id', flat=True)))
|
||||
data['copies_count'] = len(data['copies'])
|
||||
|
||||
del data['year_of_activity']
|
||||
data['activity'] = self.get_activity_rating()
|
||||
|
||||
|
||||
log.debug(data)
|
||||
|
||||
# hack for ES, because a decimal is serialized as 'Decimal('0.302')'
|
||||
# so we must convert that to a float
|
||||
data['activity'] = float(self.activity_rating)
|
||||
del data['activity_rating']
|
||||
|
||||
try:
|
||||
if self.latest:
|
||||
deps = self.latest.dependencies.all()
|
||||
|
|
|
@ -2,47 +2,22 @@ import datetime
|
|||
import commonware.log
|
||||
from celery.decorators import task
|
||||
|
||||
from .models import Package
|
||||
from jetpack.models import Package
|
||||
from elasticutils import get_es
|
||||
|
||||
log = commonware.log.getLogger('f.celery')
|
||||
|
||||
|
||||
@task
|
||||
def fill_package_activity(full_year=False, *args, **kwargs):
|
||||
"""
|
||||
Collect all the revisions for each package, distinct by day, in the past year
|
||||
and determine the year of activity.
|
||||
"""
|
||||
log.info('Inserting data into year_of_activity.')
|
||||
pkgs = Package.objects.filter(deleted=False)
|
||||
now = datetime.datetime.utcnow()
|
||||
year = 365
|
||||
last_year = now - datetime.timedelta(year)
|
||||
|
||||
for pkg in pkgs:
|
||||
if full_year or not pkg.activity_updated_at:
|
||||
days = year
|
||||
time_since = last_year
|
||||
else:
|
||||
time_since = pkg.activity_updated_at
|
||||
days = (now - time_since).days
|
||||
|
||||
if days <= 0:
|
||||
continue
|
||||
|
||||
revs = (pkg.revisions.filter(created_at__gte=time_since)
|
||||
.order_by('-created_at'))
|
||||
|
||||
activity = list('0'*days)
|
||||
|
||||
for rev in revs:
|
||||
day = (now - rev.created_at).days
|
||||
activity[day] = '1'
|
||||
|
||||
|
||||
activity = ''.join(activity)
|
||||
pkg.year_of_activity = activity + pkg.year_of_activity[:-days]
|
||||
pkg.activity_updated_at = now
|
||||
pkg.save()
|
||||
|
||||
log.info('Finished filling data into year_of_activity.')
|
||||
|
||||
def calculate_activity_rating(pks,**kw):
    """
    Recompute and save ``activity_rating`` for the given packages.

    ``pks`` is an iterable of Package primary keys.  Extra keyword
    arguments are accepted and ignored.
    """
    joined_ids = ','.join(str(pk) for pk in pks)
    log.debug('ES starting calculate_activity_rating for packages: [%s]'
              % joined_ids)

    packages = Package.objects.filter(pk__in=pks)
    for pkg in packages:
        pkg.activity_rating = pkg.calc_activity_rating()
        pkg.save()

    log.debug('ES completed calculate_activity_rating for packages: [%s]'
              % joined_ids)
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
import os
|
||||
import datetime
|
||||
import commonware
|
||||
from decimal import Decimal
|
||||
|
||||
from test_utils import TestCase
|
||||
from nose import SkipTest
|
||||
|
@ -332,14 +333,40 @@ class PackageTest(TestCase):
|
|||
addon_saved = Package.objects.get(author=self.author, type='a')
|
||||
eq_(addon_saved.description, description)
|
||||
|
||||
def test_activity_rating(self):
|
||||
pack = Package()
|
||||
def test_activity_rating_calculation_one_year(self):
|
||||
addon = Package.objects.create(author=self.author, type='a')
|
||||
|
||||
eq_(0, addon.calc_activity_rating())
|
||||
|
||||
pack.year_of_activity = '0' * 365
|
||||
eq_(pack.get_activity_rating(), 0)
|
||||
now = datetime.datetime.utcnow()
|
||||
|
||||
for i in range(1,366):
|
||||
r = addon.revisions.create(author=self.author, revision_number=i)
|
||||
r.created_at=now-datetime.timedelta(i)
|
||||
super(PackageRevision, r).save()
|
||||
|
||||
#created packages, including initial
|
||||
eq_(366, addon.revisions.count())
|
||||
eq_(Decimal('1'), addon.calc_activity_rating())
|
||||
|
||||
def test_activity_rating_calculation_first_week(self):
|
||||
addon = Package(type='a', author=self.author)
|
||||
addon.save()
|
||||
|
||||
pack.year_of_activity = '1' * 365
|
||||
eq_(pack.get_activity_rating() > 0.99, True)
|
||||
now = datetime.datetime.utcnow()
|
||||
|
||||
# Create one week's worth of revisions; this should equal 0.30 of the score
|
||||
# see Package.calc_activity_rating in models.py for the weights
|
||||
|
||||
for i in range(1,8):
|
||||
r = addon.revisions.create(author=self.author, revision_number=i)
|
||||
r.created_at=now-datetime.timedelta(i)
|
||||
super(PackageRevision, r).save()
|
||||
|
||||
eq_(8, addon.revisions.count())
|
||||
|
||||
eq_(Decimal('0.300'), addon.calc_activity_rating())
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ import commonware
|
|||
import tempfile
|
||||
import os
|
||||
import datetime
|
||||
import decimal
|
||||
|
||||
from test_utils import TestCase
|
||||
|
||||
|
@ -10,10 +11,11 @@ from nose.tools import eq_
|
|||
from django.contrib.auth.models import User
|
||||
from django.conf import settings
|
||||
|
||||
from jetpack.tasks import calculate_activity_rating
|
||||
from jetpack.models import Package, PackageRevision, Module, Attachment, SDK
|
||||
from jetpack.errors import SelfDependencyException, FilenameExistException, \
|
||||
DependencyException
|
||||
from jetpack.tasks import fill_package_activity
|
||||
|
||||
from base.templatetags.base_helpers import hashtag
|
||||
|
||||
log = commonware.log.getLogger('f.test')
|
||||
|
@ -391,48 +393,27 @@ class PackageRevisionTest(TestCase):
|
|||
assert old_rev.name
|
||||
assert old_package.full_name
|
||||
assert old_package.name
|
||||
|
||||
def test_fill_package_activity(self):
|
||||
orig = '0'*365
|
||||
|
||||
def test_update_package_activity_cron(self):
|
||||
addon = Package(type='a', author=self.author)
|
||||
addon.save()
|
||||
|
||||
eq_(addon.year_of_activity, orig)
|
||||
|
||||
fill_package_activity.delay()
|
||||
|
||||
addon = Package.objects.get(pk=addon.pk)
|
||||
|
||||
new = '1' + orig[:-1]
|
||||
eq_(addon.year_of_activity, new)
|
||||
|
||||
def test_package_activity_cron(self):
|
||||
addon = Package(type='a', author=self.author)
|
||||
addon.save()
|
||||
fill_package_activity.delay(full_year=True)
|
||||
|
||||
now = datetime.datetime.utcnow()
|
||||
|
||||
# Superficially creating revisions in the past
|
||||
r2 = addon.revisions.create(author=self.author, revision_number=2)
|
||||
r2.created_at=now-datetime.timedelta(5)
|
||||
super(PackageRevision, r2).save()
|
||||
|
||||
r3 = addon.revisions.create(author=self.author, revision_number=3)
|
||||
r3.created_at=now-datetime.timedelta(3)
|
||||
super(PackageRevision, r3).save()
|
||||
|
||||
# Create one week's worth of revisions; this should equal 0.30 of the score
|
||||
# see Package.calc_activity_rating in models.py for the weights
|
||||
|
||||
for i in range(1,8):
|
||||
r = addon.revisions.create(author=self.author, revision_number=i)
|
||||
r.created_at=now-datetime.timedelta(i)
|
||||
super(PackageRevision, r).save()
|
||||
|
||||
#run task on this one package
|
||||
calculate_activity_rating([addon.pk])
|
||||
|
||||
addon = Package.objects.get(pk=addon.pk)
|
||||
addon.activity_updated_at = now - datetime.timedelta(4)
|
||||
addon.save()
|
||||
|
||||
|
||||
old = addon.year_of_activity
|
||||
fill_package_activity.delay(full_year=False)
|
||||
|
||||
addon = Package.objects.get(pk=addon.pk)
|
||||
|
||||
eq_('1001'+old[:-4], addon.year_of_activity)
|
||||
|
||||
eq_(addon.activity_rating, addon.calc_activity_rating())
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -22,6 +22,10 @@ def index_all():
|
|||
for chunk in chunked(ids, 100):
|
||||
tasks.index_all.apply_async(args=[chunk], connection=conn)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@cronjobs.register
|
||||
def setup_mapping():
|
||||
"""Create index, and setup mapping, for ES."""
|
||||
|
|
|
@ -73,13 +73,16 @@ def _get_average_activity():
|
|||
return average
|
||||
# TODO: ES has statistical facet that can provide average, but I couldn't
|
||||
# get it working.
|
||||
qs = Package.search().filter(activity__gt=0)
|
||||
|
||||
qs = Package.search().filter(activity__gt=0.001)
|
||||
values = qs.values('activity')[:qs.count()]
|
||||
|
||||
num = len(values)
|
||||
|
||||
if num > 0:
|
||||
average = sum(v[1] for v in values) / num
|
||||
else:
|
||||
average = 0.2
|
||||
|
||||
|
||||
cache.set(ACTIVITY_CACHE_KEY, average, 60*60*24)
|
||||
return average
|
||||
|
|
|
@ -24,7 +24,11 @@ def index_all(pks, **kw):
|
|||
else:
|
||||
log.debug('ES finished bulk action for packages: [%s]' % ids_str)
|
||||
|
||||
|
||||
@task
def index_one(pk, **kw):
    """
    Refresh the search index entry of a single package.

    ``pk`` is the primary key of the Package to index.  Extra keyword
    arguments are accepted and ignored.
    """
    package = Package.objects.get(pk=pk)
    # Index once; a second refresh_index() call would only repeat the
    # same work.
    package.refresh_index()
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
-- Package activity is no longer tracked per day; drop the old columns.
ALTER TABLE jetpack_package DROP COLUMN year_of_activity;
ALTER TABLE jetpack_package DROP COLUMN activity_updated_at;
-- The model declares activity_rating as a non-null decimal defaulting to 0.
-- Without NOT NULL DEFAULT 0, existing rows would hold NULL until the
-- nightly cron has recalculated them.
ALTER TABLE jetpack_package ADD COLUMN activity_rating decimal(4,3) NOT NULL DEFAULT 0;
|
Загрузка…
Ссылка в новой задаче