Mirror of https://github.com/mozilla/kitsune.git
Added API for oSUMO.
This commit is contained in:
Parent: 62a5803df0
Commit: cf8ccaa583
@@ -33,6 +33,7 @@ Part 2: Developer's Guide
   karma
   vendor
   wikidocs
   osumo
   notes
   licenses
@@ -0,0 +1,23 @@
.. _osumo-chapter:

============
Offline SUMO
============

The primary documentation for Offline SUMO lives at
https://osumo.readthedocs.org. The source lives at
https://github.com/mozilla/osumo.

Offline SUMO requires a component in Kitsune, and that component relies
heavily on Redis: all the articles are generated once a day and stored
there. Make sure Redis is available.

The code for Offline SUMO's bundle generation lives under
`kitsune/offline`. Inside, there are a couple of files defined:

- utils.py does the actual bundle generation.
- index.py is responsible for generating the index for offline search.
- urls.py defines the URLs for Django.
- views.py implements the views.
- cron.py has the cron job that runs daily for bundle generation.
- tests/ has the unit tests.
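
For a quick sanity check, the cron job that generates the bundles can also
be run by hand from a Django shell. A minimal sketch (``build_kb_bundles``
is defined in `kitsune/offline/cron.py`; it fails with an error if Redis is
not reachable, and the product list shown here is just an example):

.. code-block:: python

    from kitsune.offline.cron import build_kb_bundles

    # Generate and store the bundles for a single product in all of
    # the SUMO languages.
    build_kb_bundles(products=('firefox',))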
@@ -0,0 +1,72 @@
import datetime
import logging

from django.conf import settings
from django.contrib import admin, messages
from django.shortcuts import render

from kitsune.offline.cron import build_kb_bundles
from kitsune.sumo.redis_utils import redis_client


log = logging.getLogger('k.offline')


def offline_admin(request):
    redis = redis_client('default')

    action = request.POST.get('action')
    if action == 'generate_all':
        log.info('Requested regenerating all bundles.')
        build_kb_bundles()
        messages.add_message(request, messages.SUCCESS,
                             'Bundles regenerated!')
    elif action == 'delete_all':
        if redis.delete(*redis.keys('osumo:*')):
            messages.add_message(request, messages.SUCCESS,
                                 'Deleted all bundles!')
        else:
            messages.add_message(request, messages.ERROR,
                                 'Bundle deleting failed.')

    keys = redis.keys('osumo:*')
    bundles = []
    totalsize = 0
    for key in keys:
        bundle = {}
        # Reverse operation to redis_bundle_name; the schema is:
        # osumo:locale~product
        tmp = key.split(':')[1].split('~')

        locale, bundle['product'] = tuple(tmp)
        # To get the non-.lower()'ed version.
        locale = settings.LANGUAGE_URL_MAP[locale]
        bundle['locale'] = settings.LOCALES[locale].english

        bundle['hash'] = redis.hget(key, 'hash')

        updated = float(redis.hget(key, 'updated'))
        updated = datetime.datetime.fromtimestamp(updated)
        bundle['updated'] = updated.strftime('%Y-%m-%d %H:%M:%S')

        bundle['size'] = round(len(redis.hget(key, 'bundle')) / 1024.0, 2)
        totalsize += bundle['size']

        bundles.append(bundle)

    # Sort by locale and then product.
    bundles.sort(key=lambda x: x['locale'] + x['product'])

    totalsize /= 1024
    totalsize = round(totalsize, 2)

    return render(request,
                  'admin/offline.html',
                  {'title': 'Offline SUMO Administration',
                   'bundles': bundles,
                   'totalsize': totalsize})


admin.site.register_view('offline',
                         offline_admin,
                         'Offline SUMO Administration')
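
The view above reverses the Redis key schema by hand. A minimal sketch of
that round trip, assuming the osumo:locale~product schema produced by
redis_bundle_name in kitsune/offline/utils.py (values are lowercased):

    from kitsune.offline.utils import redis_bundle_name

    key = redis_bundle_name('en-US', 'Firefox')  # 'osumo:en-us~firefox'

    # The reverse operation used by offline_admin: strip the 'osumo:'
    # prefix, then split on '~'.
    locale, product = key.split(':')[1].split('~')
    assert (locale, product) == ('en-us', 'firefox')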
@@ -0,0 +1,46 @@
import logging
import time

from django.conf import settings

from cronjobs import register
from statsd import statsd

from kitsune.offline.utils import (
    bundle_for_product,
    merge_bundles,
    insert_bundle_into_redis
)
from kitsune.products.models import Product
from kitsune.sumo.utils import uselocale
from kitsune.sumo.redis_utils import redis_client


log = logging.getLogger('k.offline')


@register
def build_kb_bundles(products=('firefox-os', 'firefox', 'mobile')):
    redis = redis_client('default')

    if not redis:
        raise IOError('Redis not available. Cannot generate offline bundles.')

    start_time = time.time()
    size = 0

    products = [Product.objects.get(slug=p) for p in products]
    with statsd.timer('offline.build_kb_bundles.time_elapsed'):
        for locale in settings.SUMO_LANGUAGES:
            for product in products:
                with uselocale(locale):
                    bundle = merge_bundles(bundle_for_product(product, locale))

                size += len(insert_bundle_into_redis(redis,
                                                     product.slug,
                                                     locale,
                                                     bundle)[0])

    time_taken = time.time() - start_time
    log.info('Generated all offline bundles. '
             'Size: {0}. Took {1} seconds'.format(size, time_taken))
@@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
from __future__ import division

import math
import string
import re


_whitespace_regex = re.compile(r'\s|-', flags=re.U)
_alpha_regex = re.compile(r'\w', flags=re.U)


def find_word_locations_with_spaces(s):
    """Builds an index in the format of {word: locations}.

    This is an English-like search. For languages without spaces to
    separate words, use find_word_locations_without_spaces.

    This is a future-proof function. If we need to add location-based
    indexing for better searches with multiple search terms (especially
    for languages like Chinese, Japanese, and Korean), we need each
    word's location.

    In this routine, words at the end of a sentence are separated from
    the next word by a gap of 2, and by a gap of 1 if they are separated
    by a comma (or the like).

    Right now, the routine is only used to get the word counts in
    TFIDFIndex.
    """
    s = s.lower()
    words = [u'']
    for c in s:
        if c in '\'"[]1234567890/\\()_':
            continue
        elif c in '.!?':  # We want to treat . as a big stop. Add two spaces.
            words.append(u'')
            words.append(u'')
        elif _whitespace_regex.match(c) or c in string.punctuation:
            words.append(u'')
        elif _alpha_regex.match(c) is not None:
            words[-1] += c
        else:
            # Characters that we don't care about, such as control
            # characters. It's okay to skip them.
            continue

    locations = {}
    for i, w in enumerate(words):
        if w:
            locations.setdefault(w, []).append(i)

    return locations


def find_word_locations_without_spaces(s):
    """Builds an index in the format of {word: locations}.

    This method is for languages like Chinese where there are no spaces
    to denote the beginning and end of a word.
    """
    words = [u'']
    for c in s:
        if c in u'\'"[]1234567890/\\()_()【】『』、¥《》’‘”“':
            continue
        # These are at least the sentence-ending punctuation marks in
        # Chinese.
        elif c in u'。!?':
            words.append(u'')
            words.append(u'')
        # Yes, East Asian languages can still contain whitespace.
        elif _whitespace_regex.match(c) or c in u";:,、" + string.punctuation:
            words.append(u'')
        elif _alpha_regex.match(c) is not None:
            # Treat each character as a word of its own.
            words.append(c)
        else:
            # Something weird, but it is totally okay: this character is
            # probably not significant (maybe invisible).
            continue

    locations = {}
    for i, w in enumerate(words):
        if w:
            locations.setdefault(w, []).append(i)
    return locations


class TFIDFIndex(object):
    """This is an index for search and ranking based on TF-IDF.

    TF-IDF (Term Frequency - Inverse Document Frequency) is a relatively
    simple and intuitive NLP technique that scores words in a document,
    given a corpus, based on how important each word is.

    A full explanation of this is provided at
    http://osumo.readthedocs.org/en/latest/offlinesearch.html#index-structure.
    """
    def __init__(self):
        self.doc_count = 0
        self.global_word_freq = {}
        self.local_word_freq = {}
        self.docs_words_boosts = {}

    def feed(self, doc_id, texts, get_locations):
        self.doc_count += 1
        self.local_word_freq.setdefault(doc_id, {})
        self.docs_words_boosts.setdefault(doc_id, {})

        for text, boost in texts:
            locations = get_locations(text)
            for w, loc in locations.iteritems():
                global_freq = self.global_word_freq.setdefault(w, 0)
                local_freq = len(loc)
                self.global_word_freq[w] = global_freq + local_freq

                old_local_freq = self.local_word_freq[doc_id].setdefault(w, 0)
                self.local_word_freq[doc_id][w] = old_local_freq + local_freq

                # Keep the highest boost seen for this word. Use a
                # separate variable so the text's own boost is not
                # clobbered for the next word.
                word_boost = max(self.docs_words_boosts[doc_id].get(w, 0),
                                 boost)

                if word_boost != 1:  # Only store non-default boosts.
                    self.docs_words_boosts[doc_id][w] = word_boost

    def _f(self, term, doc_id):
        """The frequency of a certain term in a certain document."""
        return self.local_word_freq[doc_id][term]

    def _tf(self, term, doc_id):
        """The term frequency term of the TF-IDF formula.

        Adapted from Wikipedia:
        tf(t, d) = 0.5 + \\frac{0.5 f(t, d)}{\\max\\{f(w, d) : w \\in d\\}}
        """
        o = self._f(term, doc_id) / max(self.local_word_freq[doc_id].values())
        return 0.5 + (0.5 * o)

    def _idf(self, term):
        """The inverse document frequency term of the TF-IDF formula.

        Adapted from Wikipedia (with a base-2 logarithm):
        idf(t, D) = \\log_2 \\frac{|D|}{|\\{d \\in D : t \\in d\\}|}
        """
        appearance = 0
        for doc_id, words in self.local_word_freq.iteritems():
            appearance += 1 if term in words else 0

        return math.log(self.doc_count / appearance, 2)

    def tfidf(self, term, doc_id):
        """The whole TF-IDF formula put together.

        Adapted from Wikipedia.
        """
        boost = self.docs_words_boosts[doc_id].get(term, 1)
        return self._tf(term, doc_id) * self._idf(term) * boost

    def tfidf_doc(self, doc_id):
        """Computes the TF-IDF score for each term in a document."""
        doc = self.local_word_freq[doc_id]
        scores = []
        for word in doc:
            scores.append((word, round(self.tfidf(word, doc_id), 2)))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

    def offline_index(self):
        """Builds the offline index: {word: [(doc_id, score), ...]}."""
        index = {}
        for doc_id in self.local_word_freq:
            scores = self.tfidf_doc(doc_id)
            for word, score in scores:
                l = index.setdefault(word, [])
                l.append((doc_id, score))
        return index
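
A minimal sketch of how this index is driven (the same pattern
bundle_for_product in kitsune/offline/utils.py uses: feed (text, boost)
pairs per document, then build the final index; the document ids and
strings here are made up):

    from kitsune.offline.index import (
        TFIDFIndex,
        find_word_locations_with_spaces
    )

    idx = TFIDFIndex()
    # Titles get a higher boost than summaries, mirroring utils.py.
    idx.feed(1, [(u'firefox bookmarks', 1.2),
                 (u'an article about firefox bookmarks', 1)],
             find_word_locations_with_spaces)
    idx.feed(2, [(u'private browsing', 1.2),
                 (u'an article about private browsing', 1)],
             find_word_locations_with_spaces)

    # {word: [(doc_id, score), ...]}; 'firefox' only occurs in doc 1,
    # so it scores higher there than a word like 'about' that occurs
    # in both documents.
    index = idx.offline_index()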
@@ -0,0 +1,59 @@
{% extends "kadmin/base.html" %}

{% block content_title %}
<h1>Offline SUMO Administration</h1>
{% endblock %}

{% block content %}
<section>
  <h2>Currently available bundles</h2>
  {% if bundles %}
    <table>
      <thead>
        <tr>
          <th>Locale</th>
          <th>Product</th>
          <th>Bundle Hash</th>
          <th>Last updated (server time)</th>
          <th>Size</th>
        </tr>
      </thead>
      <tbody>
        {% for bundle in bundles %}
          <tr>
            <td>{{ bundle.locale }}</td>
            <td>{{ bundle.product }}</td>
            <td>{{ bundle.hash }}</td>
            <td>{{ bundle.updated }}</td>
            <td>{{ bundle.size }} KB</td>
          </tr>
        {% endfor %}
        <tr>
          <td><strong>All</strong></td>
          <td><strong>All</strong></td>
          <td> --- </td>
          <td> --- </td>
          <td><strong>{{ totalsize }} MB</strong></td>
        </tr>
      </tbody>
    </table>
    <p>Note: sizes are the raw sizes of the JSON. They may not reflect the actual size in Redis, or the size downloaded, due to compression.</p>
  {% else %}
    <p>No bundles are in Redis. Please generate them.</p>
  {% endif %}
</section>

<section>
  <h2>Database administration</h2>
  <form method="POST">
    {% csrf_token %}
    <input type="hidden" name="action" value="generate_all" />
    <input type="submit" value="Regenerate all bundles (This may take a while)" />
  </form>
  <form method="POST">
    {% csrf_token %}
    <input type="hidden" name="action" value="delete_all" />
    <input type="submit" value="Delete all bundles" />
  </form>
</section>
{% endblock %}
@@ -0,0 +1,298 @@
# -*- coding: utf-8 -*-
import time

from nose.tools import eq_

from kitsune.offline import utils
from kitsune.products.tests import product, topic
from kitsune.sumo.tests import TestCase
from kitsune.wiki.tests import document, revision


def _create_doc(title='', product=None, topic=None, is_archived=False):
    title = 'test ' + title if title else 'test'
    doc = document(title=title, save=True, is_archived=is_archived)
    revision(summary='summary', is_approved=True, document=doc, save=True)

    if is_archived:
        expected = {
            'key': 'en-US~' + doc.slug,
            'title': doc.title,
            'archived': True,
            'slug': doc.slug
        }
    else:
        updated = time.mktime(doc.current_revision.created.timetuple())
        expected = {
            'key': 'en-US~' + doc.slug,
            'title': title,
            'html': doc.html,
            'updated': updated,
            'slug': doc.slug,
            'id': doc.id,
            'archived': False
        }

    if product:
        doc.products.add(product)

    if topic:
        doc.topics.add(topic)

    return doc, expected


def _create_product_bundle(prefix='moo'):
    p = product(title=prefix + 'firefox', save=True)
    t1 = topic(title=prefix + 'topic1', product=p, save=True)
    t2 = topic(title=prefix + 'topic2', product=p, save=True)

    doc1, expected_doc1 = _create_doc(title=prefix + 'doc1',
                                      product=p, topic=t1)
    doc2, expected_doc2 = _create_doc(title=prefix + 'doc2',
                                      product=p, topic=t2)

    expected_locale_doc = {
        'key': u'en-US',
        'name': u'English',
        'products': [{
            'slug': p.slug,
            'name': p.title
        }]
    }

    expected_topic1 = {
        'key': 'en-US~' + p.slug + '~' + t1.slug,
        'name': t1.title,
        'docs': [doc1.slug],
        'product': p.slug,
        'slug': t1.slug,
        'children': []
    }

    expected_topic2 = {
        'key': 'en-US~' + p.slug + '~' + t2.slug,
        'name': t2.title,
        'docs': [doc2.slug],
        'product': p.slug,
        'slug': t2.slug,
        'children': []
    }

    return p, {
        'doc1': expected_doc1,
        'doc2': expected_doc2,
        'locale': expected_locale_doc,
        'topic1': expected_topic1,
        'topic2': expected_topic2
    }


class OfflineWikiDataGenerationTest(TestCase):
    def test_serialize_document(self):
        doc, expected = _create_doc()
        serialized = utils.serialize_document_for_offline(doc)
        eq_(expected, serialized)

    def test_serialized_archived_document(self):
        doc, expected = _create_doc(is_archived=True)
        serialized = utils.serialize_document_for_offline(doc)
        eq_(expected, serialized)

    def test_bundle_for_product(self):
        p, expected_bundle = _create_product_bundle()

        bundle = utils.bundle_for_product(p, 'en-US')

        assert 'locales' in bundle
        eq_(1, len(bundle['locales']))
        eq_(expected_bundle['locale'], bundle['locales'].values()[0])

        assert 'topics' in bundle
        eq_(2, len(bundle['topics']))
        topics = sorted(bundle['topics'].values(), key=lambda t: t['slug'])
        eq_(expected_bundle['topic1'], topics[0])
        eq_(expected_bundle['topic2'], topics[1])

        assert 'docs' in bundle
        docs = sorted(bundle['docs'].values(), key=lambda d: d['title'])
        eq_(expected_bundle['doc1'], docs[0])
        eq_(expected_bundle['doc2'], docs[1])

        assert 'indexes' in bundle
        eq_(1, len(bundle['indexes']))
        assert 'en-US~moofirefox' in bundle['indexes']
        assert 'index' in bundle['indexes']['en-US~moofirefox']
        eq_(u'en-US~moofirefox', bundle['indexes']['en-US~moofirefox']['key'])

    def test_merge_bundles(self):
        p1, expected_bundle1 = _create_product_bundle()
        p2, expected_bundle2 = _create_product_bundle('yay')

        bundle1 = utils.bundle_for_product(p1, 'en-US')
        bundle2 = utils.bundle_for_product(p2, 'en-US')

        merged = utils.merge_bundles(bundle1, bundle2)

        assert 'locales' in merged
        eq_(1, len(merged['locales']))

        expected_locale = expected_bundle1['locale']
        expected_locale['products'] += expected_bundle2['locale']['products']

        eq_(expected_locale, merged['locales'][0])

        assert 'topics' in merged
        eq_(4, len(merged['topics']))

        merged['topics'].sort(key=lambda t: t['slug'])

        eq_(expected_bundle1['topic1'], merged['topics'][0])
        eq_(expected_bundle1['topic2'], merged['topics'][1])
        eq_(expected_bundle2['topic1'], merged['topics'][2])
        eq_(expected_bundle2['topic2'], merged['topics'][3])

        assert 'docs' in merged
        eq_(4, len(merged['docs']))

        merged['docs'].sort(key=lambda d: d['title'])

        eq_(expected_bundle1['doc1'], merged['docs'][0])
        eq_(expected_bundle1['doc2'], merged['docs'][1])
        eq_(expected_bundle2['doc1'], merged['docs'][2])
        eq_(expected_bundle2['doc2'], merged['docs'][3])

        eq_(2, len(merged['indexes']))
        merged['indexes'].sort(key=lambda i: i['key'])
        eq_('en-US~moofirefox', merged['indexes'][0]['key'])
        eq_('en-US~yayfirefox', merged['indexes'][1]['key'])

    def test_index_generation(self):
        p = product(title='firefox', save=True)
        t = topic(title='topic1', product=p, save=True)

        doc = document(title='firefox bookmarks',
                       locale='en-US', save=True)

        revision(is_approved=True,
                 summary='this is an article about firefox bookmarks',
                 document=doc, save=True)

        doc.products.add(p)
        doc.topics.add(t)

        doc2 = document(title='private browsing',
                        locale='en-US', save=True)

        revision(is_approved=True,
                 summary='this is an article about private browsing',
                 document=doc2, save=True)

        doc2.products.add(p)
        doc2.topics.add(t)

        bundle = utils.bundle_for_product(p, 'en-US')
        index = bundle['indexes']['en-US~firefox']['index']

        words_in_both = ('this', 'is', 'an', 'article', 'about')

        for word in words_in_both:
            assert word in index
            eq_(2, len(index[word]))
            eq_(2, len(index[word][0]))
            eq_(2, len(index[word][1]))

        assert 'firefox' in index
        eq_(1, len(index['firefox']))
        # Yeah. 'firefox' in this corpus _better_ score higher than 'this'.
        assert index['firefox'][0][1] > index['this'][0][1]

        assert 'bookmarks' in index
        eq_(1, len(index['bookmarks']))
        assert index['bookmarks'][0][1] > index['this'][0][1]

        assert 'private' in index
        eq_(1, len(index['private']))
        assert index['private'][0][1] > index['this'][0][1]

        assert 'browsing' in index
        eq_(1, len(index['browsing']))
        assert index['browsing'][0][1] > index['this'][0][1]

    def test_archived_articles_in_bundle(self):
        p = product(title='firefox', save=True)
        t1 = topic(title='topic1', product=p, save=True)

        doc = document(title='test', is_archived=True,
                       locale='en-US', save=True)
        revision(is_approved=True, document=doc, save=True)
        doc.products.add(p)
        doc.topics.add(t1)

        bundle = utils.bundle_for_product(p, 'en-US')
        eq_(1, len(bundle['docs']))
        doc = bundle['docs'].values()[0]
        eq_(True, doc['archived'])
        assert 'html' not in doc
        eq_(1, len(bundle['topics']))

    def test_redirect_articles_in_bundle(self):
        p = product(title='firefox', save=True)
        t1 = topic(title='topic1', product=p, save=True)

        doc = document(title='test2', locale='en-US', save=True)
        revision(is_approved=True,
                 document=doc,
                 save=True)

        doc.products.add(p)
        doc.topics.add(t1)

        doc = document(title='test', locale='en-US', save=True)
        revision(is_approved=True, document=doc, content=u'REDIRECT [[doc2]]',
                 save=True)

        doc.products.add(p)
        doc.topics.add(t1)

        bundle = utils.bundle_for_product(p, 'en-US')
        eq_(1, len(bundle['docs']))
        doc = bundle['docs'].values()[0]
        eq_('test2', doc['title'])

    def test_bogus_articles_in_bundle(self):
        p = product(title='firefox', save=True)
        topic(title='topic1', product=p, save=True)

        # A document with no revision should be fine; it is simply skipped.
        doc = document(title='test2', locale='en-US', save=True)

        bundle = utils.bundle_for_product(p, 'en-US')
        eq_(0, len(bundle['docs']))
        eq_(0, len(bundle['topics']))

        # An article with no html.
        revision(document=doc, content='', save=True)
        bundle = utils.bundle_for_product(p, 'en-US')
        eq_(0, len(bundle['docs']))
        eq_(0, len(bundle['topics']))

    def test_other_languages(self):
        p = product(title='firefox', save=True)
        t1 = topic(title='topic1', product=p, save=True)

        doc = document(title='test', locale='en-US', save=True)
        revision(is_approved=True, document=doc, save=True)

        doc.products.add(p)
        doc.topics.add(t1)

        translated_doc = document(title=u'测试', locale='zh-CN', parent=doc,
                                  save=True)
        revision(is_approved=True, document=translated_doc, save=True)

        bundle = utils.bundle_for_product(p, 'zh-CN')
        eq_(1, len(bundle['docs']))

        doc = bundle['docs'].values()[0]
        eq_(u'测试', doc['title'])
@@ -0,0 +1,132 @@
import json

from nose import SkipTest
from nose.tools import eq_

from django.conf import settings

from kitsune.offline.cron import build_kb_bundles
from kitsune.products.tests import product, topic
from kitsune.sumo.tests import TestCase
from kitsune.sumo.urlresolvers import reverse
from kitsune.sumo.redis_utils import RedisError, redis_client
from kitsune.wiki.models import Document
from kitsune.wiki.tests import document, revision


class OfflineViewTests(TestCase):

    def _create_bundle(self, prod, locale=settings.WIKI_DEFAULT_LANGUAGE):
        p = product(title=prod, save=True)
        t = topic(title='topic1', product=p, save=True)

        if locale == settings.WIKI_DEFAULT_LANGUAGE:
            parent = lambda i: None
        else:
            def parent(i):
                d = document(title='test {0} {1}'.format(locale, i),
                             locale=settings.WIKI_DEFAULT_LANGUAGE,
                             save=True)

                d.products.add(p)
                d.topics.add(t)
                d.save()

                revision(summary='test article {0}'.format(i),
                         document=d,
                         is_approved=True,
                         save=True)
                return d

        for i in xrange(5):
            d = document(title='test {0} {1}'.format(locale, i),
                         locale=locale, save=True)
            revision(summary='test article {0}'.format(i),
                     document=d,
                     is_approved=True,
                     save=True)

            d.products.add(p)
            d.topics.add(t)
            d.parent = parent(i)
            d.save()

        try:
            build_kb_bundles((prod, ))
        except RedisError:
            pass  # Do nothing, as we should fall back gracefully.

    def test_get_single_bundle(self):
        self._create_bundle('firefox', 'en-US')

        url = reverse('offline.get_bundle') + '?locale=en-US&product=firefox'
        resp = self.client.get(url, follow=True)
        data = json.loads(resp.content)

        assert 'locales' in data
        eq_(1, len(data['locales']))
        eq_([{u'slug': u'firefox', u'name': u'firefox'}],
            data['locales'][0]['products'])
        eq_('en-US', data['locales'][0]['key'])

        assert 'topics' in data
        eq_(1, len(data['topics']))
        eq_('en-US~firefox~topic1', data['topics'][0]['key'])
        eq_(5, len(data['topics'][0]['docs']))

        assert 'docs' in data
        eq_(5, len(data['docs']))

        assert 'indexes' in data

    def test_get_bundle_bad_request(self):
        url = reverse('offline.get_bundle')
        resp = self.client.get(url, follow=True)
        eq_(400, resp.status_code)
        data = json.loads(resp.content)
        eq_('bad request', data['error'])

    def test_get_bundle_not_found(self):
        self._create_bundle('firefox', 'en-US')
        url = reverse('offline.get_bundle') + '?locale=fr&product=redpanda'
        resp = self.client.get(url, follow=True)
        eq_(404, resp.status_code)
        data = json.loads(resp.content)
        eq_('not found', data['error'])

    def test_get_bundle_meta(self):
        self._create_bundle('firefox', 'en-US')
        url = (reverse('offline.bundle_meta') +
               '?locale=en-US&product=firefox')

        try:
            redis_client('default')
        except RedisError:
            raise SkipTest

        resp = self.client.get(url, follow=True)

        meta = json.loads(resp.content)
        hash1 = meta['hash']
        assert resp['Content-Type'] == 'application/json'

        assert len(hash1) == 40  # A sha1 hexdigest should be 40 chars long.

        doc = Document.objects.all()[0]  # Getting one document is okay.
        doc.title = 'some different title!'
        doc.save()

        # Rebuild the bundle, as the version is different now.
        build_kb_bundles(('firefox', ))

        # Test that the hash has changed.
        resp = self.client.get(url, follow=True)
        assert hash1 != json.loads(resp.content)['hash']

    def test_get_language(self):
        self._create_bundle('firefox', 'en-US')

        resp = self.client.get(reverse('offline.get_languages'))
        meta = json.loads(resp.content)

        assert {'id': 'en-US', 'name': 'English'} in meta['languages']
@@ -0,0 +1,9 @@
from django.conf.urls import patterns, url

# Note that these URLs are not processed by the locale middleware:
# http://<base>/offline/get-bundle ... etc.
urlpatterns = patterns('kitsune.offline.views',
    url(r'^/get-bundle$', 'get_bundle', name='offline.get_bundle'),
    url(r'^/bundle-meta$', 'bundle_meta', name='offline.bundle_meta'),
    url(r'^/get-languages$', 'get_languages', name='offline.get_languages')
)
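
Because the patterns are named, callers resolve them with reverse() rather
than hard-coding paths, as the view tests above do. A minimal sketch (the
query string is how the views expect their parameters):

    from kitsune.sumo.urlresolvers import reverse

    # Resolves to '/offline/get-bundle' (no locale prefix; see
    # SUPPORTED_NONLOCALES in settings).
    url = reverse('offline.get_bundle') + '?locale=en-US&product=firefox'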
@@ -0,0 +1,214 @@
from hashlib import sha1
import json
import re
import time

from tower import ugettext as _

from django.conf import settings

from kitsune.offline.index import (
    TFIDFIndex,
    find_word_locations_with_spaces,
    find_word_locations_without_spaces
)
from kitsune.wiki.config import TROUBLESHOOTING_CATEGORY, HOW_TO_CATEGORY
from kitsune.wiki.models import Document


_noscript_regex = re.compile(r'<noscript>.*?</noscript>', flags=re.DOTALL)


def bundle_key(locale, product_slug):
    """The key for a bundle as stored in the client side's IndexedDB.

    The arguments to this function must be strings. This key is used
    for the index.
    """
    return locale + '~' + product_slug


def doc_key(locale, doc_slug):
    """The key for a document as stored in the client side's IndexedDB.

    The arguments to this function must be strings.
    """
    return locale + '~' + doc_slug


def topic_key(locale, product_slug, topic_slug):
    """The key for a topic as stored in the client side's IndexedDB.

    The arguments to this function must be strings.
    """
    return locale + '~' + product_slug + '~' + topic_slug


def redis_bundle_name(locale, product_slug):
    return 'osumo:' + bundle_key(locale.lower(), product_slug.lower())


def transform_html(dochtml):
    """Transforms the html to something we want to serve in the app.

    Do things to the document html such as stripping out things the
    offline app does not need. We could also do this in WikiParser,
    but this is probably easier for now.
    """
    # Strip out all the <noscript> images.
    dochtml = _noscript_regex.sub('', dochtml)

    return dochtml


def serialize_document_for_offline(doc):
    """Grabs the document as a dictionary.

    This method returns a document that is ready to be inserted into
    the client-side database.
    """

    # In order to save some space, the doc html and summary are not
    # returned for archived articles, as they are already out of date.
    if doc.is_archived:
        return {
            'key': doc_key(doc.locale, doc.slug),
            'title': doc.title,
            'archived': True,
            'slug': doc.slug
        }
    else:
        updated = int(time.mktime(doc.current_revision.created.timetuple()))
        return {
            'key': doc_key(doc.locale, doc.slug),
            'title': doc.title,
            'html': transform_html(doc.html),
            'updated': updated,
            'slug': doc.slug,
            'id': doc.id,
            'archived': False
        }


def bundle_for_product(product, locale):
    """Gets an entire bundle for a product in a locale."""
    bundle = {}

    # Put a new locale into the bundle.
    bundle['locales'] = {}
    bundle['locales'][locale] = {
        'key': locale,
        'name': settings.LANGUAGES[locale.lower()],
        'products': [{'slug': product.slug, 'name': product.title}]
    }

    # We need dictionaries, as we need to merge everything together.
    bundle['topics'] = topics = {}
    bundle['docs'] = docs_bundle = {}
    bundle['indexes'] = {}

    index_builder = TFIDFIndex()

    docs = Document.objects.filter(
        locale=locale,
        is_template=False,
        category__in=(TROUBLESHOOTING_CATEGORY, HOW_TO_CATEGORY)
    )

    # Since languages that are derived from English will not have a
    # product assigned directly, we must look up the parent document's
    # product instead.
    if locale == settings.WIKI_DEFAULT_LANGUAGE:
        docs = docs.filter(products__id=product.id)
    else:
        docs = docs.filter(parent__products__id=product.id)

    if locale in settings.LANGUAGES_WITHOUT_SPACES:
        find_word_locations = find_word_locations_without_spaces
    else:
        find_word_locations = find_word_locations_with_spaces

    for doc in docs:
        if not doc.current_revision or not doc.html or doc.redirect_url():
            # Skip documents that have no approved revision or no html,
            # and skip redirects.
            continue

        serialized_doc = serialize_document_for_offline(doc)

        # Only non-archived documents need to be indexed.
        if not doc.is_archived:
            # We only index the title and the summary, as otherwise the
            # corpus gets too big. We also boost the score of the title.
            texts = [(doc.title, 1.2), (doc.current_revision.summary, 1)]
            index_builder.feed(doc.id, texts, find_word_locations)

        docs_bundle[serialized_doc['key']] = serialized_doc

        # Now we need to populate the topics for this locale.
        for t in doc.get_topics():
            if t.product.id == product.id:
                topic = topics.setdefault(t.slug, {})
                if not topic:  # This topic has not been set up yet.
                    topic['key'] = topic_key(locale, product.slug, t.slug)
                    # The title of the topic is not translated, so we must
                    # use gettext to get the translation for it.
                    topic['name'] = _(t.title)
                    topic['children'] = [st.slug for st in t.subtopics.all()]
                    topic['docs'] = []
                    topic['product'] = product.slug
                    topic['slug'] = t.slug
                topic['docs'].append(doc.slug)

    # The bundle needs an index!
    bundlekey = bundle_key(locale, product.slug)
    bundle['indexes'][bundlekey] = {}
    bundle['indexes'][bundlekey]['key'] = bundlekey
    # The client side will search through this index.
    bundle['indexes'][bundlekey]['index'] = index_builder.offline_index()

    return bundle


def merge_bundles(*bundles):
    """Merges multiple bundles generated by bundle_for_product into one."""
    merged_bundle = {}
    for bundle in bundles:
        if 'locales' in bundle:
            merged_locales = merged_bundle.setdefault('locales', {})
            for k, locale in bundle['locales'].iteritems():
                merged_locale = merged_locales.setdefault(k, {})
                if merged_locale:
                    merged_locale['products'].extend(locale['products'])
                else:
                    merged_locale.update(locale)

        for key in ('topics', 'docs', 'indexes'):
            if key in bundle:
                merged_bundle.setdefault(key, {}).update(bundle[key])

    # This is because the database format is actually meant to have all
    # of this in a list format.
    for key in ('locales', 'topics', 'docs', 'indexes'):
        if key in merged_bundle:
            merged_bundle[key] = merged_bundle[key].values()

    return merged_bundle


def insert_bundle_into_redis(redis, product, locale, bundle):
    """Puts a bundle into Redis.

    This is used in both the cron job and the view.
    """
    bundle = json.dumps(bundle)
    # Track a version hash. Used instead of a timestamp, as there may be
    # instances when nothing was updated between the last generation and
    # now.
    bundle_hash = sha1(bundle).hexdigest()

    name = redis_bundle_name(locale.lower(), product.lower())
    redis.hset(name, 'hash', bundle_hash)
    redis.hset(name, 'bundle', bundle)
    redis.hset(name, 'updated', time.time())

    return bundle, bundle_hash
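
Putting these helpers together, one iteration of the cron job's loop looks
like this. A minimal sketch (same calls as cron.py above; 'firefox' and
'en-US' are example values):

    from kitsune.offline.utils import (
        bundle_for_product,
        merge_bundles,
        insert_bundle_into_redis
    )
    from kitsune.products.models import Product
    from kitsune.sumo.redis_utils import redis_client

    redis = redis_client('default')
    p = Product.objects.get(slug='firefox')

    # Build, merge and store one (product, locale) bundle. The returned
    # hash is what the bundle_meta view serves for update checks.
    bundle = merge_bundles(bundle_for_product(p, 'en-US'))
    data, bundle_hash = insert_bundle_into_redis(redis, 'firefox',
                                                 'en-US', bundle)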
@@ -0,0 +1,84 @@
import json

from django.conf import settings
from django.http import (HttpResponse,
                         HttpResponseBadRequest,
                         HttpResponseNotFound)

from kitsune.offline.utils import redis_bundle_name
from kitsune.sumo.decorators import cors_enabled
from kitsune.sumo.redis_utils import redis_client, RedisError


INVALID_LOCALE = '{"error": "not found", "reason": "invalid locale"}'
NOT_FOUND = '{"error": "not found", "reason": "unknown"}'
BAD_REQUEST = '{"error": "bad request", "reason": "incomplete request"}'


@cors_enabled('*')
def get_bundle(request):
    if 'locale' not in request.GET or 'product' not in request.GET:
        return HttpResponseBadRequest(BAD_REQUEST, mimetype='application/json')

    locale = request.GET['locale']
    product = request.GET['product']
    if locale.lower() not in settings.LANGUAGES:
        return HttpResponseNotFound(INVALID_LOCALE,
                                    mimetype='application/json')

    name = redis_bundle_name(locale, product)
    try:
        redis = redis_client('default')
    except RedisError:
        return HttpResponse('not available yet', status=503)
    else:
        bundle = redis.hget(name, 'bundle')
        bundle_hash = redis.hget(name, 'hash')

    if bundle is None:
        return HttpResponseNotFound(NOT_FOUND, mimetype='application/json')

    response = HttpResponse(bundle, mimetype='application/json')
    response['Content-Length'] = len(bundle)
    response['X-Content-Hash'] = bundle_hash
    response['Access-Control-Expose-Headers'] = \
        'Content-Length, X-Content-Hash'

    return response


@cors_enabled('*')
def bundle_meta(request):
    """This view is responsible for update checking."""
    if 'locale' not in request.GET or 'product' not in request.GET:
        return HttpResponseBadRequest(BAD_REQUEST, mimetype='application/json')

    locale = request.GET['locale']
    product = request.GET['product']

    name = redis_bundle_name(locale, product)
    try:
        redis = redis_client('default')
    except RedisError:
        return HttpResponse('{"error": "no bundles available"}',
                            mimetype='application/json',
                            status=503)

    bundle_hash = redis.hget(name, 'hash')

    if bundle_hash:
        u = {'hash': bundle_hash}
        return HttpResponse(json.dumps(u), mimetype='application/json')
    else:
        return HttpResponseNotFound(NOT_FOUND, mimetype='application/json')


@cors_enabled('*')
def get_languages(request):
    """Tells the client what the supported languages are."""
    languages = []
    for code, name in settings.LANGUAGE_CHOICES:
        languages.append({'id': code, 'name': name})

    return HttpResponse(json.dumps({'languages': languages}),
                        mimetype='application/json')
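
From the client's point of view, update checking is a hash comparison:
fetch bundle-meta and only re-download the bundle when the hash differs
from the stored one. A minimal sketch using the standard library (the base
URL is a placeholder; the endpoint and the hash field are defined above):

    import json
    import urllib2

    BASE = 'https://support.mozilla.org/offline'  # placeholder base URL

    def bundle_needs_update(stored_hash, locale='en-US', product='firefox'):
        url = '{0}/bundle-meta?locale={1}&product={2}'.format(
            BASE, locale, product)
        meta = json.load(urllib2.urlopen(url))
        return meta['hash'] != stored_hash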
@@ -251,6 +251,15 @@ ES_PLUGIN_ANALYZERS = [

ES_USE_PLUGINS = False

# These are for the indexer for the offline SUMO app.
LANGUAGES_WITHOUT_SPACES = (
    'zh-CN',
    'zh-TW',
    'ja',
    'ko',
    'my'
)

TEXT_DOMAIN = 'messages'

SITE_ID = 1
@@ -309,6 +318,7 @@ SUPPORTED_NONLOCALES = (
    'api',
    'favicon.ico',
    'media',
    'offline',
    'postcrash',
    'robots.txt',
    'services',
@@ -486,6 +496,7 @@ INSTALLED_APPS = (
    'kitsune.karma',
    'kitsune.tags',
    'kitsune.kpi',
    'kitsune.offline',
    'kitsune.products',
    'rest_framework',
@@ -66,3 +66,33 @@ def json_view(f):
        })
        return http.HttpResponseServerError(blob, content_type=JSON)
    return _wrapped


def cors_enabled(origin, methods=['GET']):
    """A simple decorator to enable CORS."""
    def decorator(f):
        @wraps(f)
        def decorated_func(request, *args, **kwargs):
            if request.method == 'OPTIONS':
                # Preflight request: reply with the allowed methods and
                # headers.
                if ('HTTP_ACCESS_CONTROL_REQUEST_METHOD' in request.META and
                    'HTTP_ACCESS_CONTROL_REQUEST_HEADERS' in request.META):

                    response = http.HttpResponse()
                    response['Access-Control-Allow-Methods'] = ", ".join(
                        methods)

                    # TODO: We might need to change this.
                    response['Access-Control-Allow-Headers'] = \
                        request.META['HTTP_ACCESS_CONTROL_REQUEST_HEADERS']
                else:
                    return http.HttpResponseBadRequest()
            elif request.method in methods:
                response = f(request, *args, **kwargs)
            else:
                return http.HttpResponseBadRequest()

            response['Access-Control-Allow-Origin'] = origin
            return response
        return decorated_func
    return decorator
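
Applying the decorator is all a view needs, as kitsune/offline/views.py
above shows. A minimal sketch (the view name is illustrative):

    from django import http

    from kitsune.sumo.decorators import cors_enabled

    @cors_enabled('*', methods=['GET'])
    def my_view(request):  # illustrative view name
        # The decorator adds Access-Control-Allow-Origin: * to this
        # response and answers OPTIONS preflight requests itself.
        return http.HttpResponse('{"ok": true}',
                                 mimetype='application/json')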
@@ -35,6 +35,7 @@ urlpatterns = patterns('',
    (r'^products', include('kitsune.products.urls')),
    (r'^announcements', include('kitsune.announcements.urls')),
    (r'^badges/', include('kitsune.kbadge.urls')),
    (r'^offline', include('kitsune.offline.urls')),

    # Kitsune admin (not Django admin).
    (r'^admin/', include(admin.site.urls)),
@@ -34,6 +34,7 @@ HOME = /tmp
0 5 * * * {{ cron }} reindex_kb
0 6 * * * {{ cron }} process_exit_surveys
0 1 * * * {{ cron }} update_l10n_coverage_metrics
45 4 * * * {{ cron }} build_kb_bundles

# Twice per week.
#05 01 * * 1,4 {{ cron }} update_weekly_votes