Merge pull request #41 from pennyfx/package_activity

Package activity
This commit is contained in:
Arron Schaar 2011-10-17 14:21:42 -07:00
Родитель efc62d88a2 e1b0643508
Коммит 177d1e453f
13 изменённых файлов: 204 добавлений и 146 удалений

Просмотреть файл

@ -8,9 +8,15 @@ from django.conf import settings
import commonware
import cronjobs
from .models import Package
from . import tasks
from jetpack.models import Package
from jetpack import tasks
from celery.messaging import establish_connection
from celeryutils import chunked
from elasticutils import get_es
log = commonware.log.getLogger('f.cron')
def _prune_older_files(directory, age):
@ -52,16 +58,10 @@ def gc():
@cronjobs.register
def package_activity():
    """
    Nightly job: queue the ``fill_package_activity`` task in incremental
    mode (``full_year=False``).
    """
    job = tasks.fill_package_activity
    job.delay(full_year=False)
@cronjobs.register
def fill_package_activity():
    """Queue the ``fill_package_activity`` task with ``full_year=True``."""
    job = tasks.fill_package_activity
    job.delay(full_year=True)
def update_package_activity():
    """Queue an activity-rating recalculation for every package, in batches."""
    package_ids = Package.objects.all().values_list('id', flat=True)
    total = len(package_ids)
    log.info("Updating package activity for %s packages" % total)
    batch_size = 100
    with establish_connection() as conn:
        for batch in chunked(package_ids, batch_size):
            # One task per batch of ids, all published over the same
            # connection.
            tasks.calculate_activity_rating.apply_async(
                args=[batch], connection=conn)

Просмотреть файл

@ -10,6 +10,7 @@ import tempfile
import markdown
import hashlib
import codecs
from decimal import Decimal, getcontext
from copy import deepcopy
from django.core.exceptions import ObjectDoesNotExist, ValidationError
@ -23,6 +24,7 @@ from django.core.urlresolvers import reverse
from django.template.defaultfilters import slugify
from django.conf import settings
from django.utils.translation import ugettext as _
from django.db import connection
from cuddlefish.preflight import vk_to_jid, jid_to_programid, my_b32encode
from ecdsa import SigningKey, NIST256p
@ -1468,10 +1470,9 @@ class Package(BaseModel, SearchMixin):
active = models.BooleanField(default=True, blank=True)
# deleted is the limbo state
deleted = models.BooleanField(default=False, blank=True)
# activity
year_of_activity = models.CharField(max_length=365, default='0'*365)
activity_updated_at = models.DateTimeField(null=True, blank=True)
#package activity score
activity_rating = models.DecimalField(default=0.0, max_digits=4, decimal_places=3)
class Meta:
" Set the ordering of objects "
@ -1781,42 +1782,48 @@ class Package(BaseModel, SearchMixin):
if self.version_name:
self.version_name = alphanum_plus(self.version_name)
def get_activity_rating(self):
    """
    Build a weighted average based on activity from daily_activity
    and recency of that activity.

    Reads ``self.year_of_activity``, a string of '0'/'1' flags, and counts
    the '1' flags in each 7-character slice. The first four slices carry
    fixed weights; the remaining weight is spread evenly over the other
    slices. Returns a float (0 when no flag is set).
    """
    # slices are by week
    # first couple weeks are weighted high
    # rest of the weeks are super tiny points
    slice_size = 7
    # Floor division: a trailing partial week is ignored.
    slices = len(self.year_of_activity) // slice_size
    weights = {
        '0': 0.20,
        '1': 0.15,
        '2': 0.10,
        '3': 0.05,
    }
    remaining_percentage = 1.0 - sum(weights.values())
    unweighted_slices = slices - len(weights)
    # Guard against a division by zero when there are no slices beyond
    # the explicitly weighted ones.
    if unweighted_slices > 0:
        standard_weight = remaining_percentage / unweighted_slices
    else:
        standard_weight = 0.0

    total = 0
    for i in range(slices):
        # each slice is like '1100101'
        slice_start = i * slice_size
        slice_end = slice_start + slice_size
        points = self.year_of_activity.count('1', slice_start, slice_end)
        total += points * weights.get(str(i), standard_weight)

    return total / slice_size


def calc_activity_rating(self):
    """
    Build a weighted average based on package revisions.

    For each window of days in ``weights`` (counted back from yesterday;
    revisions created today fall outside every window), the database
    computes the fraction of days in that window on which this package
    has at least one revision. Each fraction is multiplied by the
    window's weight and the products are summed.

    Returns a ``Decimal``; arithmetic is done with 3 significant digits.
    """
    from decimal import localcontext

    #update tests if you change this.
    weights = [
        {'start': 1, 'end': 7, 'weight': Decimal('0.30')},
        {'start': 8, 'end': 14, 'weight': Decimal('0.20')},
        {'start': 15, 'end': 21, 'weight': Decimal('0.15')},
        {'start': 22, 'end': 52, 'weight': Decimal('0.15')},
        {'start': 53, 'end': 365, 'weight': Decimal('0.20')},
    ]

    # One aggregate row per window: (fraction of active days, window index).
    # Values are bound as query parameters instead of being formatted into
    # the SQL text.
    window_sql = """
        SELECT count(Days)/%s, %s as window_idx FROM
        (SELECT count(*) as Days FROM jetpack_packagerevision
        WHERE
        package_id = %s AND
        TO_DAYS(created_at) <= TO_DAYS(DATE_SUB(CURDATE(), INTERVAL %s DAY)) AND
        TO_DAYS(created_at) >= TO_DAYS(DATE_SUB(CURDATE(), INTERVAL %s DAY))
        group by package_id, TO_DAYS(created_at)) x
        """
    selects = []
    params = []
    for idx, w in enumerate(weights):
        selects.append(window_sql)
        params.extend([w['end'] + 1 - w['start'], idx, self.id,
                       w['start'], w['end']])

    # Every SELECT carries a distinct window index, so no de-duplication
    # is needed.
    query = " UNION ALL ".join(selects)

    cursor = connection.cursor()
    cursor.execute(query, params)

    # Limit precision only for this computation; assigning to
    # getcontext().prec would change Decimal arithmetic for the whole
    # thread.
    with localcontext() as ctx:
        ctx.prec = 3
        result = Decimal('0')
        for fraction, window_idx in cursor.fetchall():
            # The row order of a UNION is not guaranteed, so pick the
            # weight by the index returned with the row, not by position.
            result += weights[int(window_idx)]['weight'] * fraction

    return result
@es_required
def refresh_index(self, es, bulk=False):
@ -1830,10 +1837,12 @@ class Package(BaseModel, SearchMixin):
.exclude(package=self)
.values_list('package_id', flat=True)))
data['copies_count'] = len(data['copies'])
del data['year_of_activity']
data['activity'] = self.get_activity_rating()
# hack for ES, because a decimal is serialized as 'Decimal('0.302')'
# so we must convert that to a float
data['activity'] = float(self.activity_rating)
del data['activity_rating']
try:
if self.latest:
deps = self.latest.dependencies.all()

Просмотреть файл

@ -2,47 +2,22 @@ import datetime
import commonware.log
from celery.decorators import task
from .models import Package
from jetpack.models import Package
from elasticutils import get_es
log = commonware.log.getLogger('f.celery')
@task
def fill_package_activity(full_year=False, *args, **kwargs):
    """
    Collect all the revisions for each package, distinct by day, in the past year
    and determine the year of activity.

    ``year_of_activity`` is kept as a string of '0'/'1' flags where index 0
    is the most recent day. With ``full_year=True``, or when a package has
    never been updated (or was last updated more than a year ago), the
    whole year is rebuilt. Otherwise only the days since
    ``activity_updated_at`` are prepended and the oldest days are dropped.
    """
    log.info('Inserting data into year_of_activity.')
    pkgs = Package.objects.filter(deleted=False)
    now = datetime.datetime.utcnow()
    year = 365
    last_year = now - datetime.timedelta(year)

    for pkg in pkgs:
        last_update = pkg.activity_updated_at
        if full_year or not last_update or last_update < last_year:
            # A stale timestamp would otherwise produce more than `year`
            # flags and overflow the stored string.
            days = year
            time_since = last_year
        else:
            time_since = last_update
            days = (now - time_since).days

        if days <= 0:
            continue

        revs = pkg.revisions.filter(created_at__gte=time_since)

        # Shift the stored history right by `days` and keep one year of it.
        activity = list(('0' * days + pkg.year_of_activity)[:year])

        for rev in revs:
            day = (now - rev.created_at).days
            # `day` can equal `days` (revision inside the partial day at
            # the window edge) or be negative (created_at later than
            # `now`); only flag indexes that really exist.
            if 0 <= day < len(activity):
                activity[day] = '1'

        pkg.year_of_activity = ''.join(activity)
        pkg.activity_updated_at = now
        pkg.save()

    log.info('Finished filling data into year_of_activity.')
def calculate_activity_rating(pks,**kw):
    """Recompute and save ``activity_rating`` for the packages in ``pks``."""
    ids_str = ','.join(str(pk) for pk in pks)
    log.debug('ES starting calculate_activity_rating for packages: [%s]'
              % ids_str)

    packages = Package.objects.filter(pk__in=pks)
    for package in packages:
        rating = package.calc_activity_rating()
        package.activity_rating = rating
        package.save()

    log.debug('ES completed calculate_activity_rating for packages: [%s]'
              % ids_str)

Просмотреть файл

@ -1,6 +1,7 @@
import os
import datetime
import commonware
from decimal import Decimal
from test_utils import TestCase
from nose import SkipTest
@ -332,14 +333,40 @@ class PackageTest(TestCase):
addon_saved = Package.objects.get(author=self.author, type='a')
eq_(addon_saved.description, description)
# NOTE(review): this span appears to interleave three test methods --
# test_activity_rating (year_of_activity / get_activity_rating) and the two
# calc_activity_rating tests below. Confirm against the real tests file
# before relying on statement order.
def test_activity_rating(self):
pack = Package()
def test_activity_rating_calculation_one_year(self):
addon = Package.objects.create(author=self.author, type='a')
# Rating computed right after creation is expected to be 0.
eq_(0, addon.calc_activity_rating())
pack.year_of_activity = '0' * 365
eq_(pack.get_activity_rating(), 0)
now = datetime.datetime.utcnow()
# Create one revision per day for days 1..365 in the past.
for i in range(1,366):
r = addon.revisions.create(author=self.author, revision_number=i)
r.created_at=now-datetime.timedelta(i)
# Calls save() on the class after PackageRevision in the MRO, skipping
# PackageRevision.save() itself (presumably so the backdated created_at
# is stored as set -- confirm).
super(PackageRevision, r).save()
# 365 backdated revisions plus the initial one
eq_(366, addon.revisions.count())
eq_(Decimal('1'), addon.calc_activity_rating())
def test_activity_rating_calculation_first_week(self):
addon = Package(type='a', author=self.author)
addon.save()
pack.year_of_activity = '1' * 365
eq_(pack.get_activity_rating() > 0.99, True)
now = datetime.datetime.utcnow()
# Create 1 week's worth of revisions... should equal .30 of score
# see the weights in Package.calc_activity_rating (models.py)
for i in range(1,8):
r = addon.revisions.create(author=self.author, revision_number=i)
r.created_at=now-datetime.timedelta(i)
super(PackageRevision, r).save()
# 7 backdated revisions plus the initial one
eq_(8, addon.revisions.count())
eq_(Decimal('0.300'), addon.calc_activity_rating())

Просмотреть файл

@ -2,6 +2,7 @@ import commonware
import tempfile
import os
import datetime
import decimal
from test_utils import TestCase
@ -10,10 +11,11 @@ from nose.tools import eq_
from django.contrib.auth.models import User
from django.conf import settings
from jetpack.tasks import calculate_activity_rating
from jetpack.models import Package, PackageRevision, Module, Attachment, SDK
from jetpack.errors import SelfDependencyException, FilenameExistException, \
DependencyException
from jetpack.tasks import fill_package_activity
from base.templatetags.base_helpers import hashtag
log = commonware.log.getLogger('f.test')
@ -391,48 +393,27 @@ class PackageRevisionTest(TestCase):
assert old_rev.name
assert old_package.full_name
assert old_package.name
# NOTE(review): this span appears to interleave three test methods --
# test_fill_package_activity and test_package_activity_cron (both exercising
# fill_package_activity / year_of_activity) with
# test_update_package_activity_cron (exercising calculate_activity_rating).
# Confirm against the real tests file before relying on statement order.
def test_fill_package_activity(self):
orig = '0'*365
def test_update_package_activity_cron(self):
addon = Package(type='a', author=self.author)
addon.save()
eq_(addon.year_of_activity, orig)
fill_package_activity.delay()
# Reload from the database to see what the task saved.
addon = Package.objects.get(pk=addon.pk)
new = '1' + orig[:-1]
eq_(addon.year_of_activity, new)
def test_package_activity_cron(self):
addon = Package(type='a', author=self.author)
addon.save()
fill_package_activity.delay(full_year=True)
now = datetime.datetime.utcnow()
# Artificially creating revisions in the past
r2 = addon.revisions.create(author=self.author, revision_number=2)
r2.created_at=now-datetime.timedelta(5)
# Calls save() on the class after PackageRevision in the MRO, skipping
# PackageRevision.save() itself.
super(PackageRevision, r2).save()
r3 = addon.revisions.create(author=self.author, revision_number=3)
r3.created_at=now-datetime.timedelta(3)
super(PackageRevision, r3).save()
# Create 1 week's worth of revisions... should equal .30 of score
# see the weights in Package.calc_activity_rating (models.py)
for i in range(1,8):
r = addon.revisions.create(author=self.author, revision_number=i)
r.created_at=now-datetime.timedelta(i)
super(PackageRevision, r).save()
# run the task function directly on this one package
calculate_activity_rating([addon.pk])
addon = Package.objects.get(pk=addon.pk)
addon.activity_updated_at = now - datetime.timedelta(4)
addon.save()
old = addon.year_of_activity
fill_package_activity.delay(full_year=False)
addon = Package.objects.get(pk=addon.pk)
eq_('1001'+old[:-4], addon.year_of_activity)
# The stored rating must match a fresh calculation.
eq_(addon.activity_rating, addon.calc_activity_rating())

Просмотреть файл

@ -22,6 +22,10 @@ def index_all():
for chunk in chunked(ids, 100):
tasks.index_all.apply_async(args=[chunk], connection=conn)
@cronjobs.register
def setup_mapping():
"""Create index, and setup mapping, for ES."""

Просмотреть файл

@ -73,13 +73,16 @@ def _get_average_activity():
return average
# TODO: ES has statistical facet that can provide average, but I couldn't
# get it working.
qs = Package.search().filter(activity__gt=0)
qs = Package.search().filter(activity__gt=0.001)
values = qs.values('activity')[:qs.count()]
num = len(values)
if num > 0:
average = sum(v[1] for v in values) / num
else:
average = 0.2
cache.set(ACTIVITY_CACHE_KEY, average, 60*60*24)
return average

Просмотреть файл

@ -24,7 +24,11 @@ def index_all(pks, **kw):
else:
log.debug('ES finished bulk action for packages: [%s]' % ids_str)
@task
def index_one(pk, **kw):
"""Look up the package with primary key ``pk`` and call its refresh_index()."""
package = Package.objects.get(pk=pk)
package.refresh_index()
# NOTE(review): the repeated call below looks like a diff artifact (same line
# shown as removed and re-added) rather than an intentional second refresh --
# confirm against the real file.
package.refresh_index()

Просмотреть файл

@ -78,6 +78,12 @@ If this is a brand new installation you'll need to configure a database as
well. This command will build the structure::
./manage.py syncdb
If you're using Elastic Search locally, be sure to set up the ES index
mappings and index all your packages::
./manage.py cron setup_mapping
./manage.py cron index_all
FlightDeck needs to know about all the SDKs you have available. This command
will make it look for them and initialize the database::
@ -194,3 +200,47 @@ An example Apache WSGI configuration::
import django.core.handlers.wsgi
application = django.core.handlers.wsgi.WSGIHandler()
Recipes
===============
Import live database dump
-------------------------
How to import a database dump from live
[sudo] mysql flightdeck < flightdeck_dump.sql
If you run into an error when importing large sql dump files, you may need to
restart your mysqld process with this parameter.
mysqld --max_allowed_packet=32M
The database dump might be missing a row in the django_site table, so if you get a
Django error saying "Site matching query does not exist" when you hit the login
page, insert a row into django_site.
insert into django_site (id,domain,name) values (1,'example.com','example')
After importing the data, you will need to rebuild your ES index.
Rebuilding Elastic Search index
-------------------------------
Need to delete your Elastic Search index and start over?
curl -XDELETE 'http://localhost:9201/flightdeck'
./manage.py cron setup_mapping
./manage.py cron index_all
Create a local super user account
---------------------------------
If you imported your database then you will need to create a user.
./manage.py createsuperuser

Просмотреть файл

@ -0,0 +1,6 @@
-- Replace the per-day activity columns with a single stored rating.
ALTER TABLE jetpack_package DROP COLUMN year_of_activity;
ALTER TABLE jetpack_package DROP COLUMN activity_updated_at;
-- NOT NULL DEFAULT 0 matches the model field (non-null, default 0.0), so
-- existing rows read as 0 instead of NULL until the rating is recalculated.
ALTER TABLE jetpack_package ADD COLUMN activity_rating decimal(4,3) NOT NULL DEFAULT 0;
-- this index is intended to speed up the package activity queries.
CREATE INDEX package_id_created_at ON jetpack_packagerevision (package_id, created_at DESC);

Просмотреть файл

@ -44,7 +44,7 @@ MAILTO=flightdeck-developers@mozilla.org
#once per day
30 1 * * * $F_CRON gc
30 2 * * * $F_CRON package_activity
30 2 * * * $F_CRON update_package_activity
#every hour
30 * * * * $F_CRON celery

Просмотреть файл

@ -89,7 +89,6 @@ def update_flightdeck(ctx):
# Run management commands like this:
# manage_cmd(ctx, 'cmd')
# For 0.9.15 only
manage_cmd(ctx, 'cron fill_package_activity')
manage_cmd(ctx, 'cron update_package_activity')

2
vendor

@ -1 +1 @@
Subproject commit 6d3f6f6940266de416d932fb3c109fd785c21dfe
Subproject commit b955928b1540457397bb9f65753825be4a2f5458