kitsune/configs/sphinx/sphinx.conf

377 строки
13 KiB
Plaintext
Executable File

#!/usr/bin/env python26
# Exponent substituted for ``{n}`` in the SQL literal ``10e{n}`` that the
# ``questions`` source uses to fold a question id and an answer id into a
# single document id.
ID_FACTOR = 6

# Settings are star-imported: ``localsettings`` is tried first and
# ``localsettings_django`` is the fallback when that import fails.
# The templates below read MYSQL_*, AGE_DIVISOR, ROOT_PATH, CATALOG_PATH,
# LOG_PATH, ETC_PATH and LISTEN_* without defining them, so they are
# presumably supplied by whichever settings module loads -- verify there.
try:
    from localsettings import *
except ImportError:
    from localsettings_django import *
#############################################################################
## data source definition
#############################################################################
# Connection options shared by every ``source`` section.  The rendered text
# is spliced into each source template through its ``{mysql}`` placeholder.
_mysql_settings = dict(
    sql_host=MYSQL_HOST,
    sql_user=MYSQL_USER,
    sql_pass=MYSQL_PASS,
    sql_db=MYSQL_NAME,
)
MYSQL = """
type = mysql
sql_host = {sql_host}
sql_user = {sql_user}
sql_pass = {sql_pass}
sql_db = {sql_db}
sql_query_pre = SET NAMES utf8
sql_query_pre = SET SESSION query_cache_type = OFF
""".format(**_mysql_settings)
# ``charset_table`` option text, inserted into every ``index`` section via
# the ``{charset}`` placeholder.  Entries of the form ``A->B`` map one
# character (or range) onto another; bare entries are accepted as-is.
# The string is not raw, so each trailing backslash + newline is a Python
# line continuation: the whole table is emitted as one long line.
CHARSET_TABLE = """
charset_table = U+FF10..U+FF19->0..9, 0..9, U+FF41..U+FF5A->a..z, U+FF21..U+FF3A->a..z,\
A..Z->a..z, a..z, U+0149, U+017F, U+0138, U+00DF, U+00FF, U+00C0..U+00D6->U+00E0..U+00F6,\
U+00E0..U+00F6, U+00D8..U+00DE->U+00F8..U+00FE, U+00F8..U+00FE, U+0100->U+0101, U+0101,\
U+0102->U+0103, U+0103, U+0104->U+0105, U+0105, U+0106->U+0107, U+0107, U+0108->U+0109,\
U+0109, U+010A->U+010B, U+010B, U+010C->U+010D, U+010D, U+010E->U+010F, U+010F,\
U+0110->U+0111, U+0111, U+0112->U+0113, U+0113, U+0114->U+0115, U+0115, U+0116->U+0117,\
U+0117, U+0118->U+0119, U+0119, U+011A->U+011B, U+011B, U+011C->U+011D, U+011D,\
U+011E->U+011F, U+011F, U+0130->U+0131, U+0131, U+0132->U+0133, U+0133, U+0134->U+0135,\
U+0135, U+0136->U+0137, U+0137, U+0139->U+013A, U+013A, U+013B->U+013C, U+013C,\
U+013D->U+013E, U+013E, U+013F->U+0140, U+0140, U+0141->U+0142, U+0142, U+0143->U+0144,\
U+0144, U+0145->U+0146, U+0146, U+0147->U+0148, U+0148, U+014A->U+014B, U+014B,\
U+014C->U+014D, U+014D, U+014E->U+014F, U+014F, U+0150->U+0151, U+0151, U+0152->U+0153,\
U+0153, U+0154->U+0155, U+0155, U+0156->U+0157, U+0157, U+0158->U+0159, U+0159,\
U+015A->U+015B, U+015B, U+015C->U+015D, U+015D, U+015E->U+015F, U+015F, U+0160->U+0161,\
U+0161, U+0162->U+0163, U+0163, U+0164->U+0165, U+0165, U+0166->U+0167, U+0167,\
U+0168->U+0169, U+0169, U+016A->U+016B, U+016B, U+016C->U+016D, U+016D, U+016E->U+016F,\
U+016F, U+0170->U+0171, U+0171, U+0172->U+0173, U+0173, U+0174->U+0175, U+0175,\
U+0176->U+0177, U+0177, U+0178->U+00FF, U+00FF, U+0179->U+017A, U+017A, U+017B->U+017C,\
U+017C, U+017D->U+017E, U+017E, U+0410..U+042F->U+0430..U+044F, U+0430..U+044F,\
U+05D0..U+05EA, U+0531..U+0556->U+0561..U+0586, U+0561..U+0587, U+0621..U+063A, U+01B9,\
U+01BF, U+0640..U+064A, U+0660..U+0669, U+066E, U+066F, U+0671..U+06D3, U+06F0..U+06FF,\
U+0904..U+0939, U+0958..U+095F, U+0960..U+0963, U+0966..U+096F, U+097B..U+097F,\
U+0985..U+09B9, U+09CE, U+09DC..U+09E3, U+09E6..U+09EF, U+0A05..U+0A39, U+0A59..U+0A5E,\
U+0A66..U+0A6F, U+0A85..U+0AB9, U+0AE0..U+0AE3, U+0AE6..U+0AEF, U+0B05..U+0B39,\
U+0B5C..U+0B61, U+0B66..U+0B6F, U+0B71, U+0B85..U+0BB9, U+0BE6..U+0BF2, U+0C05..U+0C39,\
U+0C66..U+0C6F, U+0C85..U+0CB9, U+0CDE..U+0CE3, U+0CE6..U+0CEF, U+0D05..U+0D39, U+0D60,\
U+0D61, U+0D66..U+0D6F, U+0D85..U+0DC6, U+1900..U+1938, U+1946..U+194F, U+A800..U+A805,\
U+A807..U+A822, U+0386->U+03B1, U+03AC->U+03B1, U+0388->U+03B5, U+03AD->U+03B5,\
U+0389->U+03B7, U+03AE->U+03B7, U+038A->U+03B9, U+0390->U+03B9, U+03AA->U+03B9,\
U+03AF->U+03B9, U+03CA->U+03B9, U+038C->U+03BF, U+03CC->U+03BF, U+038E->U+03C5,\
U+03AB->U+03C5, U+03B0->U+03C5, U+03CB->U+03C5, U+03CD->U+03C5, U+038F->U+03C9,\
U+03CE->U+03C9, U+03C2->U+03C3, U+0391..U+03A1->U+03B1..U+03C1,\
U+03A3..U+03A9->U+03C3..U+03C9, U+03B1..U+03C1, U+03C3..U+03C9, U+0E01..U+0E2E,\
U+0E30..U+0E3A, U+0E40..U+0E45, U+0E47, U+0E50..U+0E59, U+A000..U+A48F, U+4E00..U+9FBF,\
U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF, U+2F800..U+2FA1F, U+2E80..U+2EFF,\
U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF, U+3040..U+309F, U+30A0..U+30FF,\
U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF, U+3130..U+318F, U+A000..U+A48F,\
U+A490..U+A4CF
"""
# ``ngram_len`` / ``ngram_chars`` option text, inserted into every ``index``
# section via the ``{ngram}`` placeholder.  ``ngram_len = 1`` is paired with
# an explicit list of code points and ranges.
# As in CHARSET_TABLE, each trailing backslash + newline is consumed by
# Python, so ``ngram_chars`` is emitted as a single line.
NGRAM_CHARS = """
ngram_len = 1
ngram_chars = U+4E00..U+9FBB, U+3400..U+4DB5, U+20000..U+2A6D6, U+FA0E, \
U+FA0F, U+FA11, U+FA13, U+FA14, U+FA1F, U+FA21, U+FA23, U+FA24, U+FA27, \
U+FA28, U+FA29, U+3105..U+312C, U+31A0..U+31B7, U+3041, U+3043, U+3045, \
U+3047, U+3049, U+304B, U+304D, U+304F, U+3051, U+3053, U+3055, U+3057, \
U+3059, U+305B, U+305D, U+305F, U+3061, U+3063, U+3066, U+3068, \
U+306A..U+306F, U+3072, U+3075, U+3078, U+307B, U+307E..U+3083, U+3085, \
U+3087, U+3089..U+308E, U+3090..U+3093, U+30A1, U+30A3, U+30A5, U+30A7, \
U+30A9, U+30AD, U+30AF, U+30B3, U+30B5, U+30BB, U+30BD, U+30BF, U+30C1, \
U+30C3, U+30C4, U+30C6, U+30CA, U+30CB, U+30CD, U+30CE, U+30DE, U+30DF, \
U+30E1, U+30E2, U+30E3, U+30E5, U+30E7, U+30EE, U+30F0..U+30F3, U+30F5, \
U+30F6, U+31F0, U+31F1, U+31F2, U+31F3, U+31F4, U+31F5, U+31F6, U+31F7, \
U+31F8, U+31F9, U+31FA, U+31FB, U+31FC, U+31FD, U+31FE, U+31FF, \
U+AC00..U+D7A3, U+1100..U+1159, U+1161..U+11A2, U+11A8..U+11F9, \
U+A000..U+A48C, U+A492..U+A4C6
"""
# ``source questions``: starts the generated configuration text.
#
# * The main query LEFT JOINs answers onto questions, so it yields one row
#   per (question, answer) pair and one row for a question with no answers.
# * Document id: ``q.id * 10e{n} + a.id`` (or ``q.id * 10e{n}`` when there
#   is no answer).  NOTE(review): with ID_FACTOR = 6 the literal ``10e6``
#   equals 10,000,000, not 1,000,000 -- confirm consumers decode ids with
#   the same factor.
# * Creator usernames and tag names are exposed as CRC32 values in uint
#   attributes.
# * ``age`` is (now - updated) / AGE_DIVISOR.  ``q.updated`` is wrapped in
#   UNIX_TIMESTAMP() so both operands of the subtraction are epoch seconds,
#   matching how the ``updated`` attribute itself is selected.
# * Every line continued with a backslash keeps a space before it: Python
#   removes the backslash and the newline, so a missing space would fuse
#   the last token with the first token of the following line.
config = """
source questions
{{
{mysql}
sql_query = \
SELECT \
IF(a.id, q.id * 10e{n} + a.id, q.id * 10e{n}) AS id, \
q.id AS question_id, \
q.title AS title, \
q.content AS question_content, \
a.content AS answer_content, \
q.num_answers AS replies, \
IF(q.num_answers, 1, 0) AS has_answers, \
IF(\
(SELECT \
COUNT(helpful) \
FROM \
questions_answervote av \
WHERE \
av.answer_id = a.id \
AND helpful = 1), \
1, 0) AS has_helpful, \
q.status AS status, \
IF(q.solution_id, 1, 0) AS is_solved, \
q.is_locked AS is_locked, \
UNIX_TIMESTAMP(q.created) AS created, \
UNIX_TIMESTAMP(q.updated) AS updated, \
(\
SELECT \
CRC32(username) \
FROM \
auth_user \
WHERE \
q.creator_id = auth_user.id \
) AS question_creator, \
(\
SELECT \
CRC32(username) \
FROM \
auth_user \
WHERE \
a.creator_id = auth_user.id \
) AS answer_creator, \
q.num_votes_past_week AS question_votes, \
a.upvotes AS answer_votes, \
(UNIX_TIMESTAMP() - UNIX_TIMESTAMP(q.updated))/{age_unit} AS age \
FROM \
questions_question q \
LEFT JOIN \
questions_answer a ON a.question_id = q.id
sql_attr_uint = question_id
sql_attr_uint = replies
sql_attr_uint = status
sql_attr_bool = is_solved
sql_attr_bool = is_locked
sql_attr_bool = has_answers
sql_attr_bool = has_helpful
sql_attr_timestamp = created
sql_attr_timestamp = updated
sql_attr_uint = question_creator
sql_attr_uint = answer_creator
sql_attr_uint = question_votes
sql_attr_uint = answer_votes
sql_attr_uint = age
sql_attr_multi = uint tag from query; SELECT \
IF(a.id, q.id * 10e{n} + a.id, q.id * 10e{n}) AS id, \
CRC32(t.name) \
FROM \
questions_answer a \
RIGHT JOIN \
questions_question q \
ON q.id = a.question_id \
INNER JOIN \
taggit_taggeditem ti \
ON q.id = ti.object_id AND \
ti.content_type_id = (\
SELECT \
id \
FROM \
django_content_type \
WHERE \
app_label = 'questions' AND \
model = 'question'\
) \
LEFT JOIN \
taggit_tag t \
ON t.id = ti.tag_id
}}
""".format(mysql=MYSQL, age_unit=AGE_DIVISOR, n=ID_FACTOR)
# ``index questions``: on-disk index fed by the ``questions`` source.
# Paths are built by plain concatenation of ROOT_PATH and CATALOG_PATH;
# the shared charset table and n-gram options are spliced in verbatim.
_questions_index = """
index questions
{{
source = questions
path = {root_path}{catalog_path}/questions-catalog
charset_type = utf-8
morphology = stem_en
min_stemming_len = 4
stopwords = {root_path}/stopwords.txt
wordforms = {root_path}/wordforms.txt
{charset}
{ngram}
}}
"""
config += _questions_index.format(
    root_path=ROOT_PATH,
    catalog_path=CATALOG_PATH,
    charset=CHARSET_TABLE,
    ngram=NGRAM_CHARS,
)
# ``source discussion_forums``: one document per forum post, joined to its
# thread and to the author's ``auth_user`` row.
#
# * ``updated`` is the creation time of the thread's last post (looked up
#   through ``thread.last_post_id``), not the post's own update time.
# * ``author_ord`` is the CRC32 of the author's username.
# * ``age`` is (now - post.updated) / AGE_DIVISOR.  ``post.updated`` is
#   wrapped in UNIX_TIMESTAMP() so both operands are epoch seconds; this
#   assumes the column is a DATETIME like the ``created`` columns converted
#   in this query -- confirm against the forums_post schema.
config = config + """
source discussion_forums
{{
{mysql}
sql_query = \
SELECT \
post.id, \
post.thread_id AS thread_id, \
thread.forum_id AS forum_id, \
thread.title AS title, \
thread.is_sticky AS is_sticky, \
thread.is_locked AS is_locked, \
post.author_id AS author_id, \
CRC32(author.username) AS author_ord, \
post.content AS content, \
UNIX_TIMESTAMP(thread.created) AS created, \
( \
SELECT \
UNIX_TIMESTAMP(forums_post.created) \
FROM \
forums_post \
WHERE \
forums_post.id = thread.last_post_id \
) AS updated, \
(UNIX_TIMESTAMP() - UNIX_TIMESTAMP(post.updated))/{age_unit} AS age, \
thread.replies AS replies \
FROM \
forums_post AS post \
INNER JOIN \
forums_thread AS thread ON (post.thread_id = thread.id) \
INNER JOIN \
auth_user AS author ON (post.author_id = author.id)
sql_attr_uint = thread_id
sql_attr_uint = forum_id
sql_attr_bool = is_sticky
sql_attr_bool = is_locked
sql_attr_uint = author_id
sql_attr_uint = author_ord
sql_attr_timestamp = created
sql_attr_timestamp = updated
sql_attr_uint = age
sql_attr_uint = replies
}}
""".format(mysql=MYSQL, age_unit=AGE_DIVISOR)
# ``index discussion_forums``: on-disk index fed by the
# ``discussion_forums`` source, with the same stemming, stopword, wordform,
# charset and n-gram options as the other indexes in this file.
_forums_index = """
index discussion_forums
{{
source = discussion_forums
path = {root_path}{catalog_path}/discussion-forums-catalog
charset_type = utf-8
morphology = stem_en
min_stemming_len = 4
stopwords = {root_path}/stopwords.txt
wordforms = {root_path}/wordforms.txt
{charset}
{ngram}
}}
"""
config += _forums_index.format(
    root_path=ROOT_PATH,
    catalog_path=CATALOG_PATH,
    charset=CHARSET_TABLE,
    ngram=NGRAM_CHARS,
)
# ``source wiki_pages``: one document per wiki document whose current
# revision is approved.
#
# * ``locale`` and tag names are exposed as CRC32 values.
# * The ``fx`` and ``os`` multi-value attributes take ``item_id`` from
#   wiki_firefoxversion / wiki_operatingsystem, joined on the document's
#   parent id when it has one and on its own id otherwise.
# * NOTE(review): ``content`` in the WHERE clause is unqualified; confirm
#   which table's column it resolves to (the SELECT alias for ``d.html``
#   is also named ``content``).
_wiki_source = """
source wiki_pages
{{
{mysql}
sql_query = \
SELECT \
d.id, \
d.title, \
CRC32(d.locale) AS locale, \
d.current_revision_id AS current, \
d.parent_id, \
d.html AS content, \
d.category, \
d.slug, \
r.summary, \
r.keywords, \
UNIX_TIMESTAMP(r.created) AS updated \
FROM \
wiki_document d \
INNER JOIN \
wiki_revision r \
ON r.id = d.current_revision_id \
WHERE \
r.is_approved = 1 AND \
content NOT LIKE 'REDIRECT [%'
sql_attr_timestamp = updated
sql_attr_uint = locale
sql_attr_uint = category
sql_attr_multi = uint tag from query; SELECT \
d.id, \
CRC32(t.name) \
FROM \
wiki_document d \
INNER JOIN \
taggit_taggeditem ti \
ON d.id = ti.object_id AND \
ti.content_type_id = (\
SELECT \
id \
FROM \
django_content_type \
WHERE \
app_label = 'wiki' AND \
model = 'document'\
) \
LEFT JOIN \
taggit_tag t \
ON t.id = ti.tag_id
sql_attr_multi = uint fx from query; SELECT \
d.id, \
item_id \
FROM ( \
SELECT \
id, \
IF(parent_id, parent_id, id) AS joiner \
FROM \
wiki_document \
) d \
JOIN \
wiki_firefoxversion f \
ON \
d.joiner = f.document_id;
sql_attr_multi = uint os from query; SELECT \
d.id, \
item_id \
FROM ( \
SELECT \
id, \
IF(parent_id, parent_id, id) AS joiner \
FROM \
wiki_document \
) d \
JOIN \
wiki_operatingsystem f \
ON \
d.joiner = f.document_id;
}}
"""
config += _wiki_source.format(mysql=MYSQL)
# ``index wiki_pages``: on-disk index fed by the ``wiki_pages`` source.
# Unlike the other indexes it enables HTML stripping, indexes the listed
# ``alt``/``title`` attributes, and drops the listed elements entirely.
_wiki_index = """
index wiki_pages
{{
source = wiki_pages
path = {root_path}{catalog_path}/wiki-page-catalog
charset_type = utf-8
morphology = stem_en
min_stemming_len = 4
stopwords = {root_path}/stopwords.txt
wordforms = {root_path}/wordforms.txt
html_strip = 1
html_index_attrs = image=alt,title; a=title;
html_remove_elements = script, style, object, embed, param
{charset}
{ngram}
}}
"""
config += _wiki_index.format(
    root_path=ROOT_PATH,
    catalog_path=CATALOG_PATH,
    charset=CHARSET_TABLE,
    ngram=NGRAM_CHARS,
)
# ``searchd`` section: two ``listen`` lines (a bare port, and a
# host:port pair tagged ``mysql41``) plus the log and pid file locations,
# both built by concatenating ROOT_PATH with LOG_PATH / ETC_PATH.
_searchd_settings = dict(
    listen_port=LISTEN_PORT,
    listen_sql_host=LISTEN_SQL_HOST,
    listen_sql_port=LISTEN_SQL_PORT,
    root_path=ROOT_PATH,
    log_path=LOG_PATH,
    etc_path=ETC_PATH,
)
_searchd_section = """
searchd
{{
listen = {listen_port}
listen = {listen_sql_host}:{listen_sql_port}:mysql41
log = {root_path}{log_path}/searchd.log
pid_file = {root_path}{etc_path}/searchd.pid
}}
"""
config += _searchd_section.format(**_searchd_settings)
# Write the assembled configuration to stdout.  The parenthesised form
# prints the same text under Python 2 (a single parenthesised string) and
# is also valid syntax under Python 3, where ``print config`` is not.
print(config)