Incremental question update and fixes

* unhardcodes indexes
* ditches ElasticMeta--elasticutils uses Model._meta.db_table for
  queries, so we should use that for building indexes, too
* implements incremental updates for Question and adds test
* adds some test harness code for future elastic tests
* adds basic docs for setting up Elastic
This commit is contained in:
Will Kahn-Greene 2011-11-23 11:03:57 -05:00
Родитель 5ba45381e7
Коммит f7f5bf5a2f
13 изменённых файлов: 183 добавлений и 47 удалений

Просмотреть файл

@ -66,8 +66,8 @@ def setup_mapping(index):
# TODO: If the mapping is there already and we do a put_mapping,
# does that stomp on the existing mapping or raise an error?
try:
es.put_mapping(Post.ElasticMeta.type, mapping, index)
except pyes.ElasticSearchException, e:
es.put_mapping(Post._meta.db_table, mapping, index)
except pyes.exceptions.ElasticSearchException, e:
log.error(e)
@ -110,12 +110,12 @@ def index_post(post, bulk=False, force_insert=False, es=None):
index = settings.ES_INDEXES['default']
try:
es.index(post, index, doc_type=Post.ElasticMeta.type,
es.index(post, index, doc_type=Post._meta.db_table,
id=post['id'], bulk=bulk, force_insert=force_insert)
except pyes.urllib3.TimeoutError:
# If we have a timeout, try it again rather than die. If we
# have a second one, that will cause everything to die.
es.index(post, index, doc_type=Post.ElasticMeta.type,
es.index(post, index, doc_type=Post._meta.db_table,
id=post['id'], bulk=bulk, force_insert=force_insert)
@ -126,12 +126,12 @@ def reindex_documents():
from forums.models import Post
from django.conf import settings
index = settings.ES_INDEXES['default']
index = (settings.ES_INDEXES.get(Post._meta.db_table)
or settings.ES_INDEXES['default'])
log.info('reindex posts: %s %s', index,
Post.ElasticMeta.type)
log.info('reindex posts: %s %s', index, Post._meta.db_table)
es = pyes.ES(settings.ES_HOSTS, timeout=4.0)
es = pyes.ES(settings.ES_HOSTS, timeout=10.0)
log.info('setting up mapping....')
setup_mapping(index)

Просмотреть файл

@ -186,9 +186,6 @@ class Post(ActionMixin, ModelBase):
class Meta:
ordering = ['created']
class ElasticMeta(object):
type = 'forums'
class SphinxMeta(object):
index = 'discussion_forums'
filter_mapping = {'author_ord': crc32}

Просмотреть файл

@ -73,8 +73,8 @@ def setup_mapping(index):
# TODO: If the mapping is there already and we do a put_mapping,
# does that stomp on the existing mapping or raise an error?
try:
es.put_mapping(Question.ElasticMeta.type, mapping, index)
except pyes.ElasticSearchException, e:
es.put_mapping(Question._meta.db_table, mapping, index)
except pyes.exceptions.ElasticSearchException, e:
log.error(e)
@ -164,14 +164,16 @@ def index_doc(doc, bulk=False, force_insert=False, es=None):
if es is None:
es = elasticutils.get_es()
index = settings.ES_INDEXES['default']
index = (settings.ES_INDEXES.get(Question._meta.db_table)
or settings.ES_INDEXES['default'])
try:
es.index(doc, index, doc_type=Question.ElasticMeta.type,
es.index(doc, index, doc_type=Question._meta.db_table,
id=doc['id'], bulk=bulk, force_insert=force_insert)
except pyes.urllib3.TimeoutError:
# If we have a timeout, try it again rather than die. If we
# have a second one, that will cause everything to die.
es.index(doc, index, doc_type=Question.ElasticMeta.type,
es.index(doc, index, doc_type=Question._meta.db_table,
id=doc['id'], bulk=bulk, force_insert=force_insert)
@ -190,9 +192,9 @@ def reindex_questions():
index = settings.ES_INDEXES['default']
log.info('reindex questions: %s %s', index,
Question.ElasticMeta.type)
Question._meta.db_table)
es = pyes.ES(settings.ES_HOSTS, timeout=4.0)
es = pyes.ES(settings.ES_HOSTS, timeout=10.0)
log.info('setting up mapping....')
setup_mapping(index)

Просмотреть файл

@ -8,6 +8,7 @@ from django.contrib.contenttypes.models import ContentType
from django.core.cache import cache
from django.db import models
from django.db.models.signals import post_save
from django.dispatch import receiver
from product_details import product_details
from redis.exceptions import ConnectionError
@ -21,7 +22,7 @@ import questions as constants
from questions.karma_actions import AnswerAction, SolutionAction
from questions.question_config import products
from questions.tasks import (update_question_votes, update_answer_pages,
log_answer)
log_answer, index_questions)
from search import S
from search.utils import crc32
from sumo.helpers import urlparams
@ -65,9 +66,6 @@ class Question(ModelBase, BigVocabTaggableMixin):
'Can change/remove the solution to a question'),
)
class ElasticMeta(object):
type = 'question'
class SphinxMeta(object):
index = 'questions'
filter_mapping = {
@ -249,6 +247,13 @@ class Question(ModelBase, BigVocabTaggableMixin):
return Answer.objects.filter(pk=self.solution_id).exists()
@receiver(post_save, sender=Question,
          dispatch_uid='questions.search.index')
def update_question_search_index(sender, instance, **kw):
    """Queue a search-index update for a Question that was just saved.

    Connected to ``post_save`` for ``Question``; passes the saved
    instance's id to the ``index_questions`` task via ``.delay()``.
    """
    # TODO: waffle here
    saved_ids = [instance.id]
    index_questions.delay(saved_ids)
class QuestionMetaData(ModelBase):
"""Metadata associated with a support question."""
question = models.ForeignKey('Question', related_name='metadata_set')

Просмотреть файл

@ -86,3 +86,11 @@ def log_answer(answer):
FirstAnswerAction(answer.creator, answer.created.date()).save()
unpin_this_thread()
@task
def index_questions(ids, **kw):
    """Index the questions with the given ids into Elastic.

    :arg ids: iterable of Question primary keys to (re)index
    """
    # Imported locally: questions.models imports this tasks module, so
    # importing it at module level here would be circular.
    from questions import es_search
    from questions.models import Question

    for question in Question.uncached.filter(id__in=ids):
        # es_search exposes ``index_doc`` (singular), which indexes one
        # extracted document per call.
        es_search.index_doc(es_search.extract_question(question))

Просмотреть файл

@ -3,7 +3,10 @@ from datetime import datetime
from django.conf import settings
from django.template.defaultfilters import slugify
from elasticutils import get_es
from nose.tools import eq_
from nose import SkipTest
from questions.models import Question
from sumo.tests import LocalizingClient, TestCase
@ -39,3 +42,28 @@ def tags_eq(tagged_object, tag_names):
"""Assert that the names of the tags on tagged_object are tag_names."""
eq_(sorted([t.name for t in tagged_object.tags.all()]),
sorted(tag_names))
# TODO: Have to define this here, since I need the data that TestCaseBase
# generates. Should we turn ESTestCase into a mixin?
class ESTestCase(TestCaseBase):
    """Base class for tests that need a live Elastic Search instance.

    The indexes named in ``settings.ES_INDEXES`` are deleted and rebuilt
    once per class in ``setUpClass`` and deleted again in
    ``tearDownClass``.
    """

    @classmethod
    def setUpClass(cls):
        """Skip if Elastic isn't configured; otherwise rebuild indexes."""
        super(ESTestCase, cls).setUpClass()

        es_hosts = getattr(settings, 'ES_HOSTS', None)
        if es_hosts is None:
            raise SkipTest

        # Start from a clean slate: drop the indexes if they exist.
        cls.es = get_es()
        cls._drop_indexes()

        from search.utils import es_reindex
        es_reindex()

    @classmethod
    def tearDownClass(cls):
        """Remove the indexes again, then let the parent clean up."""
        cls._drop_indexes()
        super(ESTestCase, cls).tearDownClass()

    @classmethod
    def _drop_indexes(cls):
        """Delete every index named in ``settings.ES_INDEXES``."""
        for index_name in settings.ES_INDEXES.values():
            cls.es.delete_index_if_exists(index_name)

Просмотреть файл

@ -0,0 +1,24 @@
from questions.tests import ESTestCase
from questions.models import Question
import elasticutils
import uuid
from nose.tools import eq_
class TestQuestionUpdate(ESTestCase):
    """Check that saving a Question updates the Elastic index."""

    def test_added(self):
        """A newly saved Question becomes findable by its title."""
        # Use a uuid since it's "unique" and makes sure we're not
        # accidentally picking up a Question we don't want.
        title = str(uuid.uuid4())
        question = Question(title=title,
                            content='Lorem Ipsum Dolor',
                            creator_id=118533)

        # Assert that it's not in the index before saving.
        eq_(elasticutils.S(Question).query(title=title).count(), 0)

        question.save()

        # Saving fires the post_save handler that indexes the question,
        # so exactly one match is expected now.
        # NOTE(review): presumably the indexing task runs eagerly and the
        # index is refreshed by the time we query -- confirm.
        eq_(elasticutils.S(Question).query(title=title).count(), 1)

Просмотреть файл

@ -41,16 +41,17 @@ def es_reindex():
es = elasticutils.get_es()
# TODO: unhardcode this
es.delete_index("sumo")
# Go through and delete, then recreate the indexes.
for index in settings.ES_INDEXES.values():
es.delete_index_if_exists(index)
try:
es.create_index_if_missing("sumo")
except pyes.exceptions.ElasticSearchException:
# TODO: Why would this throw an exception? We should handle
# it. Maybe Elastic isn't running or something in which case
# proceeding is an exercise in futility.
pass
try:
es.create_index_if_missing(index)
except pyes.exceptions.ElasticSearchException:
# TODO: Why would this throw an exception? We should handle
# it. Maybe Elastic isn't running or something in which case
# proceeding is an exercise in futility.
pass
# Reindex questions.
import questions.es_search

Просмотреть файл

@ -68,6 +68,11 @@ class TestCase(test_utils.TestCase):
super(TestCase, self).setUp()
settings.REDIS_BACKENDS = settings.REDIS_TEST_BACKENDS
@classmethod
def setUpClass(cls):
    """Point ``settings.ES_INDEXES`` at the test indexes for this class."""
    super(TestCase, cls).setUpClass()
    # Swap in the test index names so tests don't use the regular
    # indexes.
    test_indexes = settings.TEST_ES_INDEXES
    settings.ES_INDEXES = test_indexes
class MigrationTests(TestCase):
"""Sanity checks for the SQL migration scripts"""
@ -116,7 +121,6 @@ class MigrationTests(TestCase):
class MobileTestCase(TestCase):
def setUp(self):
super(MobileTestCase, self).setUp()
self.client.cookies[settings.MOBILE_COOKIE] = 'on'

Просмотреть файл

@ -55,8 +55,8 @@ def setup_mapping(index):
# TODO: If the mapping is there already and we do a put_mapping,
# does that stomp on the existing mapping or raise an error?
try:
es.put_mapping(Document.ElasticMeta.type, mapping, index)
except pyes.ElasticSearchException, e:
es.put_mapping(Document._meta.db_table, mapping, index)
except pyes.exceptions.ElasticSearchException, e:
log.error(e)
@ -91,14 +91,16 @@ def index_doc(doc, bulk=False, force_insert=False, es=None):
if es is None:
es = elasticutils.get_es()
index = settings.ES_INDEXES['default']
index = (settings.ES_INDEXES.get(Document._meta.db_table)
or settings.ES_INDEXES['default'])
try:
es.index(doc, index, doc_type=Document.ElasticMeta.type,
es.index(doc, index, doc_type=Document._meta.db_table,
id=doc['id'], bulk=bulk, force_insert=force_insert)
except pyes.urllib3.TimeoutError:
# If we have a timeout, try it again rather than die. If we
# have a second one, that will cause everything to die.
es.index(doc, index, doc_type=Document.ElasticMeta.type,
es.index(doc, index, doc_type=Document._meta.db_table,
id=doc['id'], bulk=bulk, force_insert=force_insert)
@ -111,10 +113,9 @@ def reindex_documents():
index = settings.ES_INDEXES['default']
log.info('reindex documents: %s %s', index,
Document.ElasticMeta.type)
log.info('reindex documents: %s %s', index, Document._meta.db_table)
es = pyes.ES(settings.ES_HOSTS, timeout=4.0)
es = pyes.ES(settings.ES_HOSTS, timeout=10.0)
log.info('setting up mapping....')
setup_mapping(index)

Просмотреть файл

@ -248,9 +248,6 @@ class Document(NotificationsMixin, ModelBase, BigVocabTaggableMixin):
('slug', 'locale'))
permissions = [('archive_document', 'Can archive document')]
class ElasticMeta(object):
type = 'wiki'
class SphinxMeta(object):
index = 'wiki_pages'
filter_mapping = {

Просмотреть файл

@ -4,11 +4,13 @@
Search
======
Kitsune uses `Sphinx Search <http://www.sphinxsearch.com>`_ to power its
on-site search facility.
Kitsune is in the process of switching from `Sphinx Search
<http://www.sphinxsearch.com>`_ to `Elastic Search
<http://www.elasticsearch.org/>`_ to power its on-site search
facility.
Sphinx search gives us a number of advantages over MySQL's full-text search or
Google's site search.
Both of these give us a number of advantages over MySQL's full-text
search or Google's site search.
* Much faster than MySQL.
* And reduces load on MySQL.
@ -17,6 +19,17 @@ Google's site search.
* We don't rely on Google reindexing the site.
* We can fine-tune the algorithm ourselves.
.. Note::
Right now we're rewriting our search system to use Elastic and
switching between Sphinx and Elastic. At some point, the results
we're getting with our Elastic-based code will be good enough to
switch over. At that point, we'll remove the Sphinx-based search
code.
Until then, we have instructions for installing both Sphinx Search
and Elastic Search.
Installing Sphinx Search
========================
@ -118,3 +131,58 @@ You can also stop ``searchd``::
This method not only lets you maintain a running Sphinx instance that doesn't
get wiped out by the tests, but also lets you see some very interesting output
from Sphinx about indexing rate and statistics.
Installing Elastic Search
=========================
There's an installation guide on the Elastic Search site:
http://www.elasticsearch.org/guide/reference/setup/installation.html
The directory you install Elastic in will hereafter be referred to as
``ELASTICDIR``.
You can configure Elastic Search with the configuration file at
``ELASTICDIR/config/elasticsearch.yml``.
Elastic Search uses three settings in ``settings.py`` that you can
override in ``settings_local.py``::
# Connection information for Elastic
ES_HOSTS = ['127.0.0.1:9200']
ES_INDEXES = {'default': 'sumo'}
TEST_ES_INDEXES = {'default': 'sumo_test'}
.. Warning::
The host setting must match the host and port in
``ELASTICDIR/config/elasticsearch.yml``. So if you change it in
one place, you must also change it in the other.
.. Warning::
Make sure the index name values in ``ES_INDEXES`` and
``TEST_ES_INDEXES`` are **not** the same. If they are the same,
then running the unit tests will delete your index.
Using Elastic Search
====================
Start Elastic Search by running::
$ ELASTICDIR/bin/elasticsearch
That launches Elastic Search in the background.
Do a complete reindexing of everything by running::
$ ./manage.py esreindex
This will delete the existing indexes, create new ones, and reindex
everything in your database. On my machine it takes about 30 minutes.
You can see Elastic Search statistics/health with::
$ ./manage.py eswhazzup

Просмотреть файл

@ -575,6 +575,7 @@ SESSION_EXISTS_COOKIE = 'sumo_session'
# Connection information for Elastic
ES_HOSTS = ['127.0.0.1:9200']
ES_INDEXES = {'default': 'sumo'}
TEST_ES_INDEXES = {'default': 'sumo_test'}
#
# Connection information for Sphinx search