# universal-search-recommendation/prepopulate.py
# (File-viewer listing metadata removed: "75 строки / 2.0 KiB / Python / Executable File".)
#!/usr/bin/env python
import re
from os import path
from sys import stdout
from recommendation.tasks.task_recommend import recommend
# Directory containing this script; the data files are resolved relative to it.
CWD = path.dirname(path.realpath(__file__))
# Longest prefix (in characters) queued per input string.
MAX_LENGTH = 10
# Every query queued so far; consulted to avoid queueing duplicates.
QUEUE = []


def explode(string):
    """
    Yield every prefix of ``string``, from length 1 up to at most
    MAX_LENGTH characters.

    Example:
    >>> list(explode('Firefox'))
    ['F', 'Fi', 'Fir', 'Fire', 'Firef', 'Firefo', 'Firefox']
    >>> list(explode('Mozilla Firefox'))
    ['M', 'Mo', 'Moz', 'Mozi', 'Mozil', 'Mozill', 'Mozilla', 'Mozilla ',
     'Mozilla F', 'Mozilla Fi']
    """
    # min() replaces the original hand-rolled conditional; behavior is
    # identical, including yielding nothing for the empty string.
    for end in range(1, min(len(string), MAX_LENGTH) + 1):
        yield string[:end]
def queue(query):
    """
    Queue a recommendation task for each previously-unseen prefix of
    ``query``.

    Every prefix produced by explode() is recorded in the module-level
    QUEUE list, so repeated or overlapping inputs never enqueue the same
    recommendation task twice.
    """
    for prefix in explode(query):
        if prefix and prefix not in QUEUE:
            QUEUE.append(prefix)
            recommend.delay(prefix)
def wikipedia():
    """
    Open the Wikipedia data file, which contains the names of the 250 most
    popular articles, and queue every unique prefix of each article name.

    Source: https://en.wikipedia.org/wiki/User:West.andrew.g/Popular_pages
    Retrieved: 18/04/2016
    """
    with open(path.join(CWD, 'data', 'wikipedia.txt'), 'r') as f:
        # Iterate the file lazily instead of materializing readlines().
        for line in f:
            # Strip the trailing newline so it cannot end up inside a
            # queued prefix (the original queued e.g. 'Dog\n' for short
            # article names); skip blank lines entirely.
            article = line.strip()
            if article:
                queue(article)
def alexa():
    """
    Open the Alexa data file, which contains the 1000 top domain names on
    the internet, and queue every unique prefix of each base domain name.

    Source: http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
    Retrieved: 18/04/2016
    """
    with open(path.join(CWD, 'data', 'alexa.txt'), 'r') as f:
        for domain in f:
            # Take everything before the first dot ('google.com' -> 'google').
            # Strip first: the character class would otherwise swallow the
            # trailing '\n' on dot-less lines. ('\.' inside a class is just '.',
            # so '[^.]+' is equivalent to the original pattern.)
            base = re.match(r'([^.]+)', domain.strip())
            # The original called base.groups() without checking for a failed
            # match, which raised AttributeError on blank or dot-leading lines.
            if base:
                queue(base.group(0))
if __name__ == '__main__':
    # Populate the recommendation queue from both data sources, then
    # report how many unique queries were enqueued in total.
    wikipedia()
    alexa()
    stdout.write('{:d} items queued.\n'.format(len(QUEUE)))