make the strings unique, use a custom analyzer

2011-07-07 12:07:59 -07:00 · 2011-07-07 12:07:59 -07:00 · 47bc4cb63d
--- a/apps/addons/search.py
+++ b/apps/addons/search.py
@ -22,9 +22,10 @@ def extract(addon):
    d = dict(zip(attrs, attrgetter(*attrs)(addon)))
    # Coerce the Translation into a string.
    d['name_sort'] = unicode(addon.name).lower()
-    d['name'] = [string for _, string in addon.translations[addon.name_id]]
-    d['description'] = [string for
-                        _, string in addon.translations[addon.description_id]]
+    translations = addon.translations
+    d['name'] = list(set(string for _, string in translations[addon.name_id]))
+    d['description'] = list(set(string for
+                                _, string in translations[addon.description_id]))
    d['app'] = [a.id for a in addon.compatible_apps]
    # This is an extra query, not good for perf.
    d['category'] = getattr(addon, 'category_ids', [])
@ -38,6 +39,7 @@ def setup_mapping():
    m = {
        # Turn off analysis on name so we can sort by it.
        'name_sort': {'type': 'string', 'index': 'not_analyzed'},
+        'name': {'type': 'string', 'analyzer': 'standardPlusWordDelimiter'},
    }
    es = elasticutils.get_es()
    try:
--- a/docs/topics/elasticsearch.rst
+++ b/docs/topics/elasticsearch.rst
@ -17,3 +17,26 @@ The reindex job uses celery to parallelize indexing. Running the job multiple
 times will replace old index items with a new document.

 The index is maintained incrementally through post_save and post_delete hooks.
+
+
+Settings
+--------
+
+We use a custom analyzer for indexing add-on names since they're a little
+different from normal text. To get the same results as our servers, put this in
+your elasticsearch.yml::
+
+    index:
+      analysis:
+        analyzer:
+          standardPlusWordDelimiter:
+            type: custom
+            tokenizer: standard
+            filter: [standard, wordDelim, lowercase, stop]
+        filter:
+          wordDelim:
+            type: word_delimiter
+            preserve_original: true
+
+If you don't do this, your results will be slightly different, but you probably
+won't notice.