This commit is contained in:
Shuhao Wu 2013-06-03 14:24:36 -07:00 коммит произвёл Mike Cooper
Родитель 62a5803df0
Коммит cf8ccaa583
17 изменённых файлов: 1152 добавлений и 0 удалений

Просмотреть файл

@ -33,6 +33,7 @@ Part 2: Developer's Guide
karma karma
vendor vendor
wikidocs wikidocs
osumo
notes notes
licenses licenses

23
docs/osumo.rst Normal file
Просмотреть файл

@ -0,0 +1,23 @@
.. _osumo-chapter:
============
Offline SUMO
============
The primary documentation for offline sumo lives here:
https://osumo.readthedocs.org. The source lives at
https://github.com/mozilla/osumo.
Offline SUMO requires a component on Kitsune and this component relies heavily
on Redis as we generate all the articles once a day and put it into
Kitsune. Make sure that is available.
The code for offline sumo's bundle generation lives under
`kitsune/offline`. Inside, there are a couple of files defined:
- utils.py does the actual bundle generation.
- index.py is responsible for generating the index for offline search.
- urls.py defines the url for Django.
- views.py implements the two views.
- cron.py has the cron job that runs daily for bundle generation.
- tests/ has unittests.

Просмотреть файл

72
kitsune/offline/admin.py Normal file
Просмотреть файл

@ -0,0 +1,72 @@
import datetime
import logging
from django.conf import settings
from django.contrib import admin, messages
from django.shortcuts import render
from kitsune.offline.cron import build_kb_bundles
from kitsune.sumo.redis_utils import redis_client
log = logging.getLogger('k.offline')
def offline_admin(request):
    """Admin page listing the offline bundles currently stored in Redis.

    A POST may carry an ``action`` field:

    - ``generate_all``: rebuild every bundle synchronously through
      ``build_kb_bundles``.
    - ``delete_all``: delete every Redis key matching ``osumo:*``.

    After handling the action (if any), every ``osumo:*`` hash is read
    back and rendered with ``admin/offline.html``. Per-bundle sizes are in
    KB and the total is in MB, both rounded to two decimals.
    """
    redis = redis_client('default')

    action = request.POST.get('action')
    if action == 'generate_all':
        log.info('Requested regenerating all bundles.')
        build_kb_bundles()
        messages.add_message(request, messages.SUCCESS,
                             'Bundles regenerated!')
    elif action == 'delete_all':
        stale_keys = redis.keys('osumo:*')
        if not stale_keys:
            # DEL without any key is rejected by Redis, so calling
            # redis.delete() with no arguments would raise instead of
            # reporting anything useful.
            messages.add_message(request, messages.INFO,
                                 'No bundles to delete.')
        elif redis.delete(*stale_keys):
            messages.add_message(request, messages.SUCCESS,
                                 'Deleted all bundles!')
        else:
            messages.add_message(request, messages.ERROR,
                                 'Bundle deleting failed.')

    keys = redis.keys('osumo:*')
    bundles = []
    totalsize = 0
    for key in keys:
        bundle = {}
        # reverse operation to redis_bundle_name, the schema is:
        # osumo:locale~product
        tmp = key.split(':')[1].split('~')
        locale, bundle['product'] = tuple(tmp)
        # to get the non .lower()'ed version.
        locale = settings.LANGUAGE_URL_MAP[locale]
        bundle['locale'] = settings.LOCALES[locale].english
        bundle['hash'] = redis.hget(key, 'hash')

        updated = float(redis.hget(key, 'updated'))
        updated = datetime.datetime.fromtimestamp(updated)
        bundle['updated'] = updated.strftime('%Y-%m-%d %H:%M:%S')

        # Size of the raw JSON in KB.
        bundle['size'] = round(len(redis.hget(key, 'bundle')) / 1024.0, 2)
        totalsize += bundle['size']

        bundles.append(bundle)

    # Sorting by locale and then product
    bundles.sort(key=lambda x: x['locale'] + x['product'])

    # The per-bundle sizes are in KB; report the total in MB.
    totalsize = round(totalsize / 1024.0, 2)

    return render(request,
                  'admin/offline.html',
                  {'title': 'Offline SUMO Administration',
                   'bundles': bundles,
                   'totalsize': totalsize})
# Register offline_admin as a custom view on the Kitsune admin site.
# NOTE(review): presumably served under the admin's 'offline' path --
# confirm against register_view's signature.
admin.site.register_view('offline',
                         offline_admin,
                         'Offline SUMO Administration')

46
kitsune/offline/cron.py Normal file
Просмотреть файл

@ -0,0 +1,46 @@
import logging
import time
from django.conf import settings
from cronjobs import register
from statsd import statsd
from kitsune.offline.utils import (
bundle_for_product,
merge_bundles,
insert_bundle_into_redis
)
from kitsune.products.models import Product
from kitsune.sumo.utils import uselocale
from kitsune.sumo.redis_utils import redis_client
log = logging.getLogger('k.offline')
@register
def build_kb_bundles(products=('firefox-os', 'firefox', 'mobile')):
    """Cron job: regenerate the offline bundle of every locale/product pair.

    ``products`` is an iterable of product slugs. For each language in
    ``settings.SUMO_LANGUAGES`` and each product, a bundle is generated
    under that locale and stored in Redis. The total serialized size and
    the elapsed time are logged at the end.

    Raises IOError when no Redis client is available.
    """
    redis = redis_client('default')
    if not redis:
        raise IOError('Redis not available. Cannot generate offline bundles.')

    started_at = time.time()
    total_size = 0

    product_objs = [Product.objects.get(slug=slug) for slug in products]

    with statsd.timer('offline.build_kb_bundles.time_elapsed'):
        for locale in settings.SUMO_LANGUAGES:
            for product in product_objs:
                # Only the generation runs under the target locale.
                with uselocale(locale):
                    raw_bundle = bundle_for_product(product, locale)
                    bundle = merge_bundles(raw_bundle)

                stored = insert_bundle_into_redis(redis,
                                                  product.slug,
                                                  locale,
                                                  bundle)
                # stored[0] is the serialized JSON of the bundle.
                total_size += len(stored[0])

    time_taken = time.time() - started_at
    log.info('Generated all offline bundles. '
             'Size: {0}. Took {1} seconds'.format(total_size, time_taken))

171
kitsune/offline/index.py Normal file
Просмотреть файл

@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
from __future__ import division
import math
import string
import re
# A single character that separates words: any whitespace or a hyphen.
_whitespace_regex = re.compile(r'\s|-', flags=re.U)
# A single character that can be part of a word.
_alpha_regex = re.compile(r'\w', flags=re.U)


def _locations_from_words(words):
    """Map every non-empty entry of ``words`` to the list of its indexes."""
    locations = {}
    for i, w in enumerate(words):
        if w:
            locations.setdefault(w, []).append(i)
    return locations


def find_word_locations_with_spaces(s):
    """Builds an index in the format of {word: [locations]}.

    This is an English like search. For languages without spaces to
    separate words, use find_word_locations_without_spaces.

    This is a futureproof function. If we need to add location based indexing
    for better searches with multiple search terms (especially for languages
    like Chinese, Japanese, and Korean), we need to find each words index.

    In this routine, we separate words at end of sentences by 2 as a gap and by
    1 if words are separated by a comma (or alike).

    Right now, the routine is only used to get the words count in TFIDFIndex.

    The text is lowercased first. Quotes, brackets, digits, slashes and
    underscores are dropped without ending the current word.
    """
    s = s.lower()
    words = [u'']
    for c in s:
        if c in '\'"[]1234567890/\\()_':
            continue
        elif c in '.!?':  # We want to treat . as a big stop. Add two space.
            words.append(u'')
            words.append(u'')
        elif _whitespace_regex.match(c) or c in string.punctuation:
            words.append(u'')
        elif _alpha_regex.match(c) is not None:
            words[-1] += c
        else:
            # characters that we don't care about such as a control character.
            # It's okay if we skip it.
            continue

    return _locations_from_words(words)


def find_word_locations_without_spaces(s):
    """Builds an index of the format of {word: [locations]}.

    This method is for languages like Chinese where there is no spaces
    to denote the beginning and end of a word. Every word character is
    indexed as a word of its own; the text is not lowercased.

    Sentence-ending punctuation leaves a gap of 2 and other separators a
    gap of 1, mirroring find_word_locations_with_spaces.
    """
    # Dropped without leaving a gap. The ideographic comma is matched here,
    # so it never reaches the separator check below.
    ignored = u'\'"[]1234567890/\\()_【】『』、¥《》”“'
    # Sentence enders, including the full-width forms (U+FF01, U+FF1F) that
    # CJK text actually uses; previously those were silently dropped.
    sentence_enders = u'。!?\uff01\uff1f'
    # Minor separators, again with full-width ; : , (U+FF1B, U+FF1A, U+FF0C).
    separators = u';:,\uff1b\uff1a\uff0c' + string.punctuation

    words = [u'']
    for c in s:
        if c in ignored:
            continue
        elif c in sentence_enders:
            words.append(u'')
            words.append(u'')
        # Yes, east asian languages could still have white space.
        elif _whitespace_regex.match(c) or c in separators:
            words.append(u'')
        elif _alpha_regex.match(c) is not None:
            words.append(c)
        else:
            # Something weird, but it is totally okay.
            # this character is probably not significant (maybe invisible)
            continue

    return _locations_from_words(words)
class TFIDFIndex(object):
    """This is an index for search and ranking based on TF-IDF.

    TF-IDF (Term Frequency - Inverse Document Frequency) is a relatively
    simple and intuitive NLP technique that scores words in a document
    given a corpus based on how important this word is.

    A full explanation of this is provided at
    http://osumo.readthedocs.org/en/latest/offlinesearch.html#index-structure.
    """

    def __init__(self):
        # Number of distinct documents fed so far.
        self.doc_count = 0
        # {word: occurrences across all documents}
        self.global_word_freq = {}
        # {doc_id: {word: occurrences in that document}}
        self.local_word_freq = {}
        # {doc_id: {word: boost}}; a boost of 1 is implied and not stored.
        self.docs_words_boosts = {}

    def feed(self, doc_id, texts, get_locations):
        """Add the words of a document to the index.

        ``texts`` is an iterable of ``(text, boost)`` pairs and
        ``get_locations`` a callable returning ``{word: [locations]}`` for
        a text. A word found in several texts of the same document keeps
        the highest boost among those texts.
        """
        # Feeding more text for a known document must not inflate |D|.
        if doc_id not in self.local_word_freq:
            self.doc_count += 1

        doc_freqs = self.local_word_freq.setdefault(doc_id, {})
        doc_boosts = self.docs_words_boosts.setdefault(doc_id, {})

        for text, boost in texts:
            for w, loc in get_locations(text).items():
                local_freq = len(loc)
                seen_in_doc = w in doc_freqs

                self.global_word_freq[w] = (
                    self.global_word_freq.get(w, 0) + local_freq)
                doc_freqs[w] = doc_freqs.get(w, 0) + local_freq

                # Use a per-word variable: rebinding ``boost`` here would
                # leak one word's stored boost to every later word of
                # this text.
                if seen_in_doc:
                    word_boost = max(doc_boosts.get(w, 1), boost)
                else:
                    word_boost = boost

                if word_boost != 1:  # save some space..
                    doc_boosts[w] = word_boost
                else:
                    doc_boosts.pop(w, None)

    def _f(self, term, doc_id):
        """The frequency of a certain term in a certain document.

        Raises KeyError if the document or the term is unknown.
        """
        return self.local_word_freq[doc_id][term]

    def _tf(self, term, doc_id):
        r"""The term frequency term of the TF-IDF formula.

        Adapted from Wikipedia:

            tf(t, d) = 0.5 + \frac{0.5 f(t, d)}{max(f(w, d), w \in d)}
        """
        highest = max(self.local_word_freq[doc_id].values())
        # float() keeps this a true division even without the module-level
        # ``from __future__ import division``.
        o = float(self._f(term, doc_id)) / highest
        return 0.5 + (0.5 * o)

    def _idf(self, term):
        r"""The inverse document frequency term from the TF-IDF formula.

        Adapted from Wikipedia.

            idf(t, D) = \log \frac{|D|}{|{d \in D : t \in d}|}

        Raises ZeroDivisionError if no fed document contains ``term``.
        """
        appearance = sum(1 for words in self.local_word_freq.values()
                         if term in words)
        return math.log(float(self.doc_count) / appearance, 2)

    def tfidf(self, term, doc_id):
        """The whole formula together for TF-IDF.

        Adapted from Wikipedia. The result is multiplied by the boost
        recorded for the term in this document (1 when none is stored).
        """
        boost = self.docs_words_boosts[doc_id].get(term, 1)
        return self._tf(term, doc_id) * self._idf(term) * boost

    def tfidf_doc(self, doc_id):
        """Computes the TF-IDF score for each term in a document.

        Returns a list of ``(word, score)`` tuples, scores rounded to two
        decimals and sorted from highest to lowest.
        """
        doc = self.local_word_freq[doc_id]
        scores = [(word, round(self.tfidf(word, doc_id), 2)) for word in doc]
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

    def offline_index(self):
        """Builds the offline index.

        Returns ``{word: [(doc_id, score), ...]}`` covering every fed
        document.
        """
        index = {}
        for doc_id in self.local_word_freq:
            for word, score in self.tfidf_doc(doc_id):
                index.setdefault(word, []).append((doc_id, score))
        return index

Просмотреть файл

@ -0,0 +1,59 @@
{% extends "kadmin/base.html" %}
{% block content_title %}
<h1>Offline SUMO Administration</h1>
{% endblock %}
{% block content %}
<section>
<h2>Currently available bundles</h2>
{% if bundles %}
<table>
<thead>
<tr>
<th>Locale</th>
<th>Product</th>
<th>Bundle Hash</th>
<th>Last updated (server time)</th>
<th>Size</th>
</tr>
</thead>
<tbody>
{% for bundle in bundles %}
<tr>
<td>{{ bundle.locale }}</td>
<td>{{ bundle.product }}</td>
<td>{{ bundle.hash }}</td>
<td>{{ bundle.updated }}</td>
<td>{{ bundle.size }} KB</td>
</tr>
{% endfor %}
<tr>
<td><strong>All</strong></td>
<td><strong>All</strong></td>
<td> --- </td>
<td> --- </td>
<td><strong>{{ totalsize }} MB</strong></td>
</tr>
</tbody>
</table>
<p>Note sizes are raw sizes of the JSON. May not reflect the actual size in Redis or the ones downloaded due to compression.</p>
{% else %}
<p>No bundles are in Redis. Please generate them.</p>
{% endif %}
</section>
<section>
<h2>Database administrations</h2>
<form method="POST">
{% csrf_token %}
<input type="hidden" name="action" value="generate_all" />
<input type="submit" value="Regenerate all bundles (This may take a while)" />
</form>
<form method="POST">
{% csrf_token %}
<input type="hidden" name="action" value="delete_all" />
<input type="submit" value="Delete all bundles" />
</form>
</section>
{% endblock %}

Просмотреть файл

Просмотреть файл

@ -0,0 +1,298 @@
# -*- coding: utf-8 -*-
import time
from nose.tools import eq_
from kitsune.offline import utils
from kitsune.products.tests import product, topic
from kitsune.sumo.tests import TestCase
from kitsune.wiki.tests import document, revision
def _create_doc(title='', product=None, topic=None, is_archived=False):
    """Create a document with one approved revision.

    Returns ``(doc, expected)`` where ``expected`` is the dictionary that
    serializing ``doc`` for offline use should produce. ``product`` and
    ``topic``, when given, are attached to the document afterwards.
    """
    if title:
        title = 'test ' + title
    else:
        title = 'test'

    doc = document(title=title, save=True, is_archived=is_archived)
    revision(summary='summary', is_approved=True, document=doc, save=True)

    if not is_archived:
        created = doc.current_revision.created
        expected = {
            'key': 'en-US~' + doc.slug,
            'title': title,
            'html': doc.html,
            'updated': time.mktime(created.timetuple()),
            'slug': doc.slug,
            'id': doc.id,
            'archived': False
        }
    else:
        # Archived documents carry neither html nor update time.
        expected = {
            'key': 'en-US~' + doc.slug,
            'title': doc.title,
            'archived': True,
            'slug': doc.slug
        }

    if product:
        doc.products.add(product)

    if topic:
        doc.topics.add(topic)

    return doc, expected
def _create_product_bundle(prefix='moo'):
    """Create a product with two topics, each holding one document.

    Returns ``(product, expected)`` where ``expected`` maps 'doc1', 'doc2',
    'locale', 'topic1' and 'topic2' to the dictionaries expected in the
    en-US bundle of that product.
    """
    p = product(title=prefix + 'firefox', save=True)
    t1 = topic(title=prefix + 'topic1', product=p, save=True)
    t2 = topic(title=prefix + 'topic2', product=p, save=True)

    doc1, expected_doc1 = _create_doc(title=prefix + 'doc1',
                                      product=p, topic=t1)
    doc2, expected_doc2 = _create_doc(title=prefix + 'doc2',
                                      product=p, topic=t2)

    def expected_topic(t, doc):
        # Expected serialized form of a topic containing a single document.
        return {
            'key': 'en-US~' + p.slug + '~' + t.slug,
            'name': t.title,
            'docs': [doc.slug],
            'product': p.slug,
            'slug': t.slug,
            'children': []
        }

    expected = {
        'doc1': expected_doc1,
        'doc2': expected_doc2,
        'locale': {
            'key': u'en-US',
            'name': u'English',
            'products': [{
                'slug': p.slug,
                'name': p.title
            }]
        },
        'topic1': expected_topic(t1, doc1),
        'topic2': expected_topic(t2, doc2)
    }
    return p, expected
class OfflineWikiDataGenerationTest(TestCase):
    """Tests for the bundle generation helpers in kitsune.offline.utils."""

    def test_serialize_document(self):
        """A regular document serializes to the full expected dictionary."""
        doc, expected = _create_doc()
        serialized = utils.serialize_document_for_offline(doc)
        eq_(expected, serialized)

    def test_serialized_archived_document(self):
        """An archived document serializes to the reduced dictionary."""
        doc, expected = _create_doc(is_archived=True)
        serialized = utils.serialize_document_for_offline(doc)
        eq_(expected, serialized)

    def test_bundle_for_product(self):
        """A bundle holds the locale, topics, docs and index of a product."""
        p, expected_bundle = _create_product_bundle()

        bundle = utils.bundle_for_product(p, 'en-US')

        assert 'locales' in bundle
        eq_(1, len(bundle['locales']))
        eq_(expected_bundle['locale'], bundle['locales'].values()[0])

        assert 'topics' in bundle
        eq_(2, len(bundle['topics']))
        # Sort so that the comparison does not depend on dictionary order.
        topics = sorted(bundle['topics'].values(), key=lambda t: t['slug'])
        eq_(expected_bundle['topic1'], topics[0])
        eq_(expected_bundle['topic2'], topics[1])

        assert 'docs' in bundle
        docs = sorted(bundle['docs'].values(), key=lambda d: d['title'])
        eq_(expected_bundle['doc1'], docs[0])
        eq_(expected_bundle['doc2'], docs[1])

        assert 'indexes' in bundle
        eq_(1, len(bundle['indexes']))
        assert 'en-US~moofirefox' in bundle['indexes']
        assert 'index' in bundle['indexes']['en-US~moofirefox']
        eq_(u'en-US~moofirefox', bundle['indexes']['en-US~moofirefox']['key'])

    def test_merge_bundles(self):
        """Merging two product bundles of one locale yields lists of both."""
        p1, expected_bundle1 = _create_product_bundle()
        p2, expected_bundle2 = _create_product_bundle('yay')

        bundle1 = utils.bundle_for_product(p1, 'en-US')
        bundle2 = utils.bundle_for_product(p2, 'en-US')

        merged = utils.merge_bundles(bundle1, bundle2)

        assert 'locales' in merged
        eq_(1, len(merged['locales']))

        # Both products end up under the single shared locale.
        expected_locale = expected_bundle1['locale']
        expected_locale['products'] += expected_bundle2['locale']['products']

        eq_(expected_locale, merged['locales'][0])

        assert 'topics' in merged
        eq_(4, len(merged['topics']))

        merged['topics'].sort(key=lambda t: t['slug'])

        eq_(expected_bundle1['topic1'], merged['topics'][0])
        eq_(expected_bundle1['topic2'], merged['topics'][1])
        eq_(expected_bundle2['topic1'], merged['topics'][2])
        eq_(expected_bundle2['topic2'], merged['topics'][3])

        assert 'docs' in merged
        eq_(4, len(merged['docs']))

        merged['docs'].sort(key=lambda d: d['title'])

        eq_(expected_bundle1['doc1'], merged['docs'][0])
        eq_(expected_bundle1['doc2'], merged['docs'][1])
        eq_(expected_bundle2['doc1'], merged['docs'][2])
        eq_(expected_bundle2['doc2'], merged['docs'][3])

        eq_(2, len(merged['indexes']))
        merged['indexes'].sort(key=lambda i: i['key'])
        eq_('en-US~moofirefox', merged['indexes'][0]['key'])
        eq_('en-US~yayfirefox', merged['indexes'][1]['key'])

    def test_index_generation(self):
        """Words unique to one document outscore words shared by both."""
        p = product(title='firefox', save=True)
        t = topic(title='topic1', product=p, save=True)

        doc = document(title='firefox bookmarks',
                       locale='en-US', save=True)

        revision(is_approved=True,
                 summary='this is an article about firefox bookmarks',
                 document=doc, save=True)

        doc.products.add(p)
        doc.topics.add(t)

        doc2 = document(title='private browsing',
                        locale='en-US', save=True)

        revision(is_approved=True,
                 summary='this is an article about private browsing',
                 document=doc2, save=True)

        doc2.products.add(p)
        doc2.topics.add(t)

        bundle = utils.bundle_for_product(p, 'en-US')
        index = bundle['indexes']['en-US~firefox']['index']

        words_in_both = ('this', 'is', 'an', 'article', 'about')

        for word in words_in_both:
            assert word in index
            # One (doc_id, score) entry per document.
            eq_(2, len(index[word]))
            eq_(2, len(index[word][0]))
            eq_(2, len(index[word][1]))

        assert 'firefox' in index
        eq_(1, len(index['firefox']))
        # Yeah. 'firefox' in this corpus _better_ score higher than 'this'.
        assert index['firefox'][0][1] > index['this'][0][1]

        assert 'bookmarks' in index
        eq_(1, len(index['bookmarks']))
        assert index['bookmarks'][0][1] > index['this'][0][1]

        assert 'private' in index
        eq_(1, len(index['private']))
        assert index['private'][0][1] > index['this'][0][1]

        assert 'browsing' in index
        eq_(1, len(index['browsing']))
        assert index['browsing'][0][1] > index['this'][0][1]

    def test_archived_articles_in_bundle(self):
        """Archived documents are bundled, but without their html."""
        p = product(title='firefox', save=True)
        t1 = topic(title='topic1', product=p, save=True)

        doc = document(title='test', is_archived=True,
                       locale='en-US', save=True)
        revision(is_approved=True, document=doc, save=True)
        doc.products.add(p)
        doc.topics.add(t1)

        bundle = utils.bundle_for_product(p, 'en-US')
        eq_(1, len(bundle['docs']))

        doc = bundle['docs'].values()[0]
        eq_(True, doc['archived'])
        assert 'html' not in doc
        eq_(1, len(bundle['topics']))

    def test_redirect_articles_in_bundle(self):
        """Redirect documents are left out of the bundle."""
        p = product(title='firefox', save=True)
        t1 = topic(title='topic1', product=p, save=True)

        doc = document(title='test2', locale='en-US', save=True)
        revision(is_approved=True,
                 document=doc,
                 save=True)

        doc.products.add(p)
        doc.topics.add(t1)

        # Second document whose approved content is a redirect.
        doc = document(title='test', locale='en-US', save=True)
        revision(is_approved=True, document=doc, content=u'REDIRECT [[doc2]]',
                 save=True)

        doc.products.add(p)
        doc.topics.add(t1)

        bundle = utils.bundle_for_product(p, 'en-US')
        eq_(1, len(bundle['docs']))
        doc = bundle['docs'].values()[0]
        eq_('test2', doc['title'])

    def test_bogus_articles_in_bundle(self):
        """Documents without a usable revision are left out of the bundle."""
        # NOTE(review): the document below is never added to ``p`` or to the
        # topic, so the product filter alone may already exclude it and
        # these assertions could pass vacuously -- confirm.
        p = product(title='firefox', save=True)
        topic(title='topic1', product=p, save=True)

        # Document with no revision should be fun
        doc = document(title='test2', locale='en-US', save=True)

        bundle = utils.bundle_for_product(p, 'en-US')
        eq_(0, len(bundle['docs']))
        eq_(0, len(bundle['topics']))

        # article with no html.
        revision(document=doc, content='', save=True)
        bundle = utils.bundle_for_product(p, 'en-US')
        eq_(0, len(bundle['docs']))
        eq_(0, len(bundle['topics']))

    def test_other_languages(self):
        """A translation is bundled through the product of its parent."""
        p = product(title='firefox', save=True)
        t1 = topic(title='topic1', product=p, save=True)

        doc = document(title='test', locale='en-US', save=True)
        revision(is_approved=True, document=doc, save=True)

        doc.products.add(p)
        doc.topics.add(t1)

        # The translation itself is not attached to the product.
        translated_doc = document(title=u'测试', locale='zh-CN', parent=doc,
                                  save=True)
        revision(is_approved=True, document=translated_doc, save=True)

        bundle = utils.bundle_for_product(p, 'zh-CN')
        eq_(1, len(bundle['docs']))

        doc = bundle['docs'].values()[0]
        eq_(u'测试', doc['title'])

Просмотреть файл

@ -0,0 +1,132 @@
import json
from nose import SkipTest
from nose.tools import eq_
from django.conf import settings
from kitsune.offline.cron import build_kb_bundles
from kitsune.products.tests import product, topic
from kitsune.sumo.tests import TestCase
from kitsune.sumo.urlresolvers import reverse
from kitsune.sumo.redis_utils import RedisError, redis_client
from kitsune.wiki.models import Document
from kitsune.wiki.tests import document, revision
class OfflineViewTests(TestCase):
    """Tests for the JSON views in kitsune.offline.views."""

    def _skip_unless_redis(self):
        """Skip the running test when the Redis client is unavailable.

        Without Redis the views answer 503 instead of serving a bundle, so
        tests asserting on bundle content cannot pass.
        """
        try:
            redis_client('default')
        except RedisError:
            raise SkipTest

    def _create_bundle(self, prod, locale=settings.WIKI_DEFAULT_LANGUAGE):
        """Create five documents for ``prod`` in ``locale`` and bundle them.

        For a non-default locale every document gets a parent in the
        default language, and the parent carries the product and topic too.
        """
        p = product(title=prod, save=True)
        t = topic(title='topic1', product=p, save=True)

        if locale == settings.WIKI_DEFAULT_LANGUAGE:
            parent = lambda i: None
        else:
            def parent(i):
                d = document(title='test {0} {1}'.format(locale, i),
                             locale=settings.WIKI_DEFAULT_LANGUAGE,
                             save=True)

                d.products.add(p)
                d.topics.add(t)
                d.save()

                revision(summary='test article {0}'.format(i),
                         document=d,
                         is_approved=True,
                         save=True)
                return d

        for i in xrange(5):
            d = document(title='test {0} {1}'.format(locale, i),
                         locale=locale, save=True)
            revision(summary='test article {0}'.format(i),
                     document=d,
                     is_approved=True,
                     save=True)

            d.products.add(p)
            d.topics.add(t)
            d.parent = parent(i)
            d.save()

        try:
            build_kb_bundles((prod, ))
        except RedisError:
            # Deliberately best effort: tests that need the stored bundle
            # call _skip_unless_redis() themselves.
            pass

    def test_get_single_bundle(self):
        """get_bundle returns the stored bundle of a locale and product."""
        self._skip_unless_redis()
        self._create_bundle('firefox', 'en-US')

        url = reverse('offline.get_bundle') + '?locale=en-US&product=firefox'
        resp = self.client.get(url, follow=True)
        data = json.loads(resp.content)

        assert 'locales' in data
        eq_(1, len(data['locales']))
        eq_([{u'slug': u'firefox', u'name': u'firefox'}],
            data['locales'][0]['products'])
        eq_('en-US', data['locales'][0]['key'])

        assert 'topics' in data
        eq_(1, len(data['topics']))
        eq_('en-US~firefox~topic1', data['topics'][0]['key'])
        eq_(5, len(data['topics'][0]['docs']))

        assert 'docs' in data
        eq_(5, len(data['docs']))

        assert 'indexes' in data

    def test_get_bundle_bad_request(self):
        """A request without locale and product is answered with 400."""
        url = reverse('offline.get_bundle')
        resp = self.client.get(url, follow=True)
        eq_(400, resp.status_code)
        data = json.loads(resp.content)
        eq_('bad request', data['error'])

    def test_get_bundle_not_found(self):
        """An unknown bundle is answered with 404."""
        # Without Redis the view answers 503 before it can report 404.
        self._skip_unless_redis()
        self._create_bundle('firefox', 'en-US')

        url = reverse('offline.get_bundle') + '?locale=fr&product=redpanda'
        resp = self.client.get(url, follow=True)
        eq_(404, resp.status_code)
        data = json.loads(resp.content)
        eq_('not found', data['error'])

    def test_get_bundle_meta(self):
        """The bundle hash changes once a document changed and is rebuilt."""
        self._skip_unless_redis()
        self._create_bundle('firefox', 'en-US')
        url = (reverse('offline.bundle_meta') +
               '?locale=en-US&product=firefox')

        resp = self.client.get(url, follow=True)

        meta = json.loads(resp.content)
        hash1 = meta['hash']
        assert resp['Content-Type'] == 'application/json'

        assert len(hash1) == 40  # sha1 hexdigest should be 40 char long.

        doc = Document.objects.all()[0]  # getting one document should be okay.
        doc.title = 'some differnet title!'
        doc.save()

        # rebuild bundle as the version is different now.
        build_kb_bundles(('firefox', ))

        # test to see if the hash has changed.
        resp = self.client.get(url, follow=True)
        assert hash1 != json.loads(resp.content)['hash']

    def test_get_language(self):
        """get_languages lists English among the supported languages."""
        self._create_bundle('firefox', 'en-US')

        resp = self.client.get(reverse('offline.get_languages'))
        meta = json.loads(resp.content)

        assert {'id': 'en-US', 'name': 'English'} in meta['languages']

9
kitsune/offline/urls.py Normal file
Просмотреть файл

@ -0,0 +1,9 @@
from django.conf.urls import patterns, url
# Note that these URLs are not considered by the locale middleware, so they
# carry no locale prefix:
# http://<base>/offline/get-bundle ... etc.
# Each pattern starts with '/' because the including pattern is '^offline'
# without a trailing slash.
urlpatterns = patterns('kitsune.offline.views',
    url(r'^/get-bundle$', 'get_bundle', name='offline.get_bundle'),
    url(r'^/bundle-meta$', 'bundle_meta', name='offline.bundle_meta'),
    url(r'^/get-languages$', 'get_languages', name='offline.get_languages')
)

214
kitsune/offline/utils.py Normal file
Просмотреть файл

@ -0,0 +1,214 @@
from hashlib import sha1
import json
import re
import time
from tower import ugettext as _
from django.conf import settings
from kitsune.offline.index import (
TFIDFIndex,
find_word_locations_with_spaces,
find_word_locations_without_spaces
)
from kitsune.wiki.config import TROUBLESHOOTING_CATEGORY, HOW_TO_CATEGORY
from kitsune.wiki.models import Document
# Matches a whole <noscript>...</noscript> element, non-greedily and across
# line breaks (DOTALL).
_noscript_regex = re.compile(r'<noscript>.*?</noscript>', flags=re.DOTALL)
def bundle_key(locale, product_slug):
    """Return the client-side indexeddb key of a bundle.

    Both arguments must be strings; they are joined with '~'. The same
    key identifies the search index of the bundle.
    """
    return '~'.join((locale, product_slug))
def doc_key(locale, doc_slug):
    """Return the client-side indexeddb key of a document.

    Both arguments must be strings; they are joined with '~'.
    """
    return '~'.join((locale, doc_slug))
def topic_key(locale, product_slug, topic_slug):
    """Return the client-side indexeddb key of a topic.

    All three arguments must be strings; they are joined with '~'.
    """
    return '~'.join((locale, product_slug, topic_slug))
def redis_bundle_name(locale, product_slug):
    """Return the name of the Redis hash storing a bundle.

    The name is 'osumo:' followed by the bundle key of the lowercased
    locale and product slug.
    """
    lowered_key = bundle_key(locale.lower(), product_slug.lower())
    return 'osumo:' + lowered_key
def transform_html(dochtml):
    """Return the document html as it should be served in the offline app.

    This strips out what the offline app does not need, which currently
    means every <noscript> element. It could also be done in WikiParser,
    but this is probably easier for now.
    """
    # <noscript> elements hold the images.
    return _noscript_regex.sub('', dochtml)
def serialize_document_for_offline(doc):
    """Return the document as a dictionary for the client-side database.

    Archived documents only carry key, title, slug and the archived flag:
    they are out of date anyway, so html and update time are left out to
    save space. Other documents additionally carry the transformed html,
    the creation time of the current revision as an integer timestamp,
    and the document id.
    """
    key = doc_key(doc.locale, doc.slug)

    if doc.is_archived:
        return {
            'key': key,
            'title': doc.title,
            'archived': True,
            'slug': doc.slug
        }

    created = doc.current_revision.created
    updated = int(time.mktime(created.timetuple()))
    return {
        'key': key,
        'title': doc.title,
        'html': transform_html(doc.html),
        'updated': updated,
        'slug': doc.slug,
        'id': doc.id,
        'archived': False
    }
def bundle_for_product(product, locale):
    """Build the entire bundle of a product in a locale.

    The bundle has four dictionaries (so that bundles can be merged
    later): 'locales', 'topics' keyed by topic slug, 'docs' keyed by
    document key, and 'indexes' holding the search index of the bundle.
    """
    topics = {}
    docs_bundle = {}
    bundle = {
        # put a new locale into the database.
        'locales': {
            locale: {
                'key': locale,
                'name': settings.LANGUAGES[locale.lower()],
                'products': [{'slug': product.slug, 'name': product.title}]
            }
        },
        'topics': topics,
        'docs': docs_bundle,
        'indexes': {}
    }

    index_builder = TFIDFIndex()

    docs = Document.objects.filter(
        locale=locale,
        is_template=False,
        category__in=(TROUBLESHOOTING_CATEGORY, HOW_TO_CATEGORY)
    )

    # Documents translated from English have no product of their own, so
    # the product of their parent is used instead.
    if locale == settings.WIKI_DEFAULT_LANGUAGE:
        product_lookup = {'products__id': product.id}
    else:
        product_lookup = {'parent__products__id': product.id}
    docs = docs.filter(**product_lookup)

    if locale in settings.LANGUAGES_WITHOUT_SPACES:
        find_word_locations = find_word_locations_without_spaces
    else:
        find_word_locations = find_word_locations_with_spaces

    for doc in docs:
        # Skip documents without an approved revision or without html,
        # and redirects.
        if not doc.current_revision or not doc.html or doc.redirect_url():
            continue

        serialized_doc = serialize_document_for_offline(doc)

        # Only non-archived documents need to be indexed.
        if not doc.is_archived:
            # Only the title and the summary are indexed, as the corpus is
            # too big otherwise. The title gets a higher boost.
            texts = [(doc.title, 1.2), (doc.current_revision.summary, 1)]
            index_builder.feed(doc.id, texts, find_word_locations)

        docs_bundle[serialized_doc['key']] = serialized_doc

        # Now populate the topics of this product for this locale.
        for t in doc.get_topics():
            if t.product.id != product.id:
                continue

            if t.slug not in topics:
                topics[t.slug] = {
                    'key': topic_key(locale, product.slug, t.slug),
                    # The title of the topic is not translated so we must
                    # use gettext to get the translation for it.
                    'name': _(t.title),
                    'children': [st.slug for st in t.subtopics.all()],
                    'docs': [],
                    'product': product.slug,
                    'slug': t.slug
                }

            topics[t.slug]['docs'].append(doc.slug)

    # The bundle needs an index! The client side searches through it.
    bundlekey = bundle_key(locale, product.slug)
    bundle['indexes'][bundlekey] = {
        'key': bundlekey,
        'index': index_builder.offline_index()
    }

    return bundle
def merge_bundles(*bundles):
    """Merges multiple bundles generated by bundle_for_product into one.

    Locales sharing a key are combined by concatenating their 'products'
    lists; 'topics', 'docs' and 'indexes' are merged by key, later bundles
    winning. In the result every section is a list of values instead of a
    dictionary, which is the format the client database expects.

    The input bundles are not modified. Calling this without any bundle
    returns an empty dictionary.
    """
    merged_bundle = {}
    for bundle in bundles:
        if 'locales' in bundle:
            merged_locales = merged_bundle.setdefault('locales', {})
            for k, locale in bundle['locales'].items():
                merged_locale = merged_locales.get(k)
                if merged_locale:
                    merged_locale['products'].extend(locale['products'])
                else:
                    # Copy the locale and its products list: extending an
                    # aliased list would change the caller's bundle.
                    merged_locale = dict(locale)
                    if 'products' in merged_locale:
                        merged_locale['products'] = list(
                            merged_locale['products'])
                    merged_locales[k] = merged_locale

        for key in ('topics', 'docs', 'indexes'):
            if key in bundle:
                merged_bundle.setdefault(key, {}).update(bundle[key])

    # This is because the database format is actually meant to have all of
    # this in a list format. list() keeps them real lists everywhere.
    for key in ('locales', 'topics', 'docs', 'indexes'):
        if key in merged_bundle:
            merged_bundle[key] = list(merged_bundle[key].values())

    return merged_bundle
def insert_bundle_into_redis(redis, product, locale, bundle):
    """Serialize a bundle to JSON and store it in redis.

    The hash named after the lowercased locale and product receives three
    fields: 'hash' (sha1 hex digest of the JSON), 'bundle' (the JSON) and
    'updated' (the current time). Returns ``(json, sha1_hexdigest)``.

    This is used in both the cron job and the view.
    """
    serialized = json.dumps(bundle)

    # The digest tracks the version. It is used instead of a timestamp as
    # nothing may have changed between the last generation and now.
    digest = sha1(serialized).hexdigest()

    hash_name = redis_bundle_name(locale.lower(), product.lower())
    redis.hset(hash_name, 'hash', digest)
    redis.hset(hash_name, 'bundle', serialized)
    redis.hset(hash_name, 'updated', time.time())

    return serialized, digest

84
kitsune/offline/views.py Normal file
Просмотреть файл

@ -0,0 +1,84 @@
import json
from django.conf import settings
from django.http import (HttpResponse,
HttpResponseBadRequest,
HttpResponseNotFound)
from kitsune.offline.utils import redis_bundle_name
from kitsune.sumo.decorators import cors_enabled
from kitsune.sumo.redis_utils import redis_client, RedisError
# Pre-serialized JSON bodies for the error responses of the offline views.
INVALID_LOCALE = '{"error": "not found", "reason": "invalid locale"}'
NOT_FOUND = '{"error": "not found", "reason": "unknown"}'
BAD_REQUEST = '{"error": "bad request", "reason": "incomplete request"}'
@cors_enabled('*')
def get_bundle(request):
    """Serve the stored JSON bundle of a locale and product.

    Expects ``locale`` and ``product`` in the query string. Responds with
    400 when either is missing, 404 for an unknown locale or a bundle that
    is not stored, and 503 when the Redis client is unavailable. On
    success the sha1 of the bundle is sent in ``X-Content-Hash``.
    """
    if 'locale' not in request.GET or 'product' not in request.GET:
        return HttpResponseBadRequest(BAD_REQUEST,
                                      content_type='application/json')

    locale = request.GET['locale']
    product = request.GET['product']
    if locale.lower() not in settings.LANGUAGES:
        return HttpResponseNotFound(INVALID_LOCALE,
                                    content_type='application/json')

    name = redis_bundle_name(locale, product)
    try:
        redis = redis_client('default')
    except RedisError:
        # NOTE(review): unlike every other error response of the offline
        # views, this body is plain text rather than JSON.
        return HttpResponse('not available yet', status=503)

    bundle = redis.hget(name, 'bundle')
    bundle_hash = redis.hget(name, 'hash')

    if bundle is None:
        return HttpResponseNotFound(NOT_FOUND,
                                    content_type='application/json')

    # content_type is used instead of the deprecated mimetype keyword.
    response = HttpResponse(bundle, content_type='application/json')
    response['Content-Length'] = len(bundle)
    response['X-Content-Hash'] = bundle_hash
    # Cross-origin clients may only read the headers that are listed here.
    response['Access-Control-Expose-Headers'] = \
        'Content-Length, X-Content-Hash'

    return response
@cors_enabled('*')
def bundle_meta(request):
    """This view is responsible for update checking.

    Expects ``locale`` and ``product`` in the query string and answers
    with ``{"hash": <hash of the stored bundle>}``. Responds with 400 when
    a parameter is missing, 404 when no hash is stored, and 503 when the
    Redis client is unavailable.
    """
    if 'locale' not in request.GET or 'product' not in request.GET:
        return HttpResponseBadRequest(BAD_REQUEST,
                                      content_type='application/json')

    locale = request.GET['locale']
    product = request.GET['product']

    name = redis_bundle_name(locale, product)
    try:
        redis = redis_client('default')
    except RedisError:
        return HttpResponse('{"error": "no bundles available"}',
                            content_type='application/json',
                            status=503)

    bundle_hash = redis.hget(name, 'hash')
    if not bundle_hash:
        return HttpResponseNotFound(NOT_FOUND,
                                    content_type='application/json')

    # content_type is used instead of the deprecated mimetype keyword.
    return HttpResponse(json.dumps({'hash': bundle_hash}),
                        content_type='application/json')
@cors_enabled('*')
def get_languages(request):
    """Responsible for telling what the supported languages are.

    Answers with ``{"languages": [{"id": code, "name": name}, ...]}``
    built from ``settings.LANGUAGE_CHOICES``.
    """
    languages = [{'id': code, 'name': name}
                 for code, name in settings.LANGUAGE_CHOICES]

    # content_type is used instead of the deprecated mimetype keyword.
    return HttpResponse(json.dumps({'languages': languages}),
                        content_type='application/json')

Просмотреть файл

@ -251,6 +251,15 @@ ES_PLUGIN_ANALYZERS = [
ES_USE_PLUGINS = False ES_USE_PLUGINS = False
# These are for the indexer for the offline sumo app.
LANGUAGES_WITHOUT_SPACES = (
'zh-CN',
'zh-TW',
'ja',
'ko',
'my'
)
TEXT_DOMAIN = 'messages' TEXT_DOMAIN = 'messages'
SITE_ID = 1 SITE_ID = 1
@ -309,6 +318,7 @@ SUPPORTED_NONLOCALES = (
'api', 'api',
'favicon.ico', 'favicon.ico',
'media', 'media',
'offline',
'postcrash', 'postcrash',
'robots.txt', 'robots.txt',
'services', 'services',
@ -486,6 +496,7 @@ INSTALLED_APPS = (
'kitsune.karma', 'kitsune.karma',
'kitsune.tags', 'kitsune.tags',
'kitsune.kpi', 'kitsune.kpi',
'kitsune.offline',
'kitsune.products', 'kitsune.products',
'rest_framework', 'rest_framework',

Просмотреть файл

@ -66,3 +66,33 @@ def json_view(f):
}) })
return http.HttpResponseServerError(blob, content_type=JSON) return http.HttpResponseServerError(blob, content_type=JSON)
return _wrapped return _wrapped
def cors_enabled(origin, methods=('GET',)):
    """A simple decorator to enable CORS.

    ``origin`` is sent as ``Access-Control-Allow-Origin`` on every response
    the decorator lets through. ``methods`` is the iterable of HTTP methods
    the view accepts; any other method, and any malformed preflight, gets
    a 400 response without CORS headers.
    """
    def decorator(f):
        @wraps(f)
        def decorated_func(request, *args, **kwargs):
            if request.method == 'OPTIONS':
                # preflight
                if 'HTTP_ACCESS_CONTROL_REQUEST_METHOD' not in request.META:
                    return http.HttpResponseBadRequest()

                response = http.HttpResponse()
                response['Access-Control-Allow-Methods'] = ", ".join(
                    methods)

                # Access-Control-Request-Headers is optional in a
                # preflight, so only echo it back when it was sent.
                # TODO: We might need to change this
                requested_headers = request.META.get(
                    'HTTP_ACCESS_CONTROL_REQUEST_HEADERS')
                if requested_headers:
                    response['Access-Control-Allow-Headers'] = \
                        requested_headers
            elif request.method in methods:
                response = f(request, *args, **kwargs)
            else:
                return http.HttpResponseBadRequest()

            response['Access-Control-Allow-Origin'] = origin
            return response
        return decorated_func
    return decorator

Просмотреть файл

@ -35,6 +35,7 @@ urlpatterns = patterns('',
(r'^products', include('kitsune.products.urls')), (r'^products', include('kitsune.products.urls')),
(r'^announcements', include('kitsune.announcements.urls')), (r'^announcements', include('kitsune.announcements.urls')),
(r'^badges/', include('kitsune.kbadge.urls')), (r'^badges/', include('kitsune.kbadge.urls')),
(r'^offline', include('kitsune.offline.urls')),
# Kitsune admin (not Django admin). # Kitsune admin (not Django admin).
(r'^admin/', include(admin.site.urls)), (r'^admin/', include(admin.site.urls)),

Просмотреть файл

@ -34,6 +34,7 @@ HOME = /tmp
0 5 * * * {{ cron }} reindex_kb 0 5 * * * {{ cron }} reindex_kb
0 6 * * * {{ cron }} process_exit_surveys 0 6 * * * {{ cron }} process_exit_surveys
0 1 * * * {{ cron }} update_l10n_coverage_metrics 0 1 * * * {{ cron }} update_l10n_coverage_metrics
45 4 * * * {{ cron }} build_kb_bundles
# Twice per week. # Twice per week.
#05 01 * * 1,4 {{ cron }} update_weekly_votes #05 01 * * 1,4 {{ cron }} update_weekly_votes