#!/usr/bin/env python
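"""
Seeds the recommendation queue: explodes the most popular Wikipedia
article titles and the top Alexa domain names into prefixes, and queues
a recommendation task for each unique prefix.

Note: `recommend` is assumed to be a Celery task (inferred from the
`.delay()` calls below); a broker and worker must be running for the
queued tasks to be processed.
"""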

import re
from os import path
from sys import stdout

from recommendation.tasks.task_recommend import recommend


# Directory containing this script; the data files live in ./data beside it.
CWD = path.dirname(path.realpath(__file__))

# Longest prefix that explode() will yield for any one string.
MAX_LENGTH = 10

# Prefixes queued so far; used to avoid queueing the same string twice.
QUEUE = []
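
# Both data files are plain text with one entry per line. Illustrative
# (hypothetical) contents:
#
#     data/wikipedia.txt: Main Page
#                         United States
#     data/alexa.txt:     google.com
#                         youtube.com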


def explode(string):
    """
    Returns an iterator that yields every prefix of the given string,
    starting with length 1 and ending with length MAX_LENGTH.

    Example:
    >>> list(explode('Firefox'))
    ['F', 'Fi', 'Fir', 'Fire', 'Firef', 'Firefo', 'Firefox']
    >>> list(explode('Mozilla Firefox'))  # doctest: +NORMALIZE_WHITESPACE
    ['M', 'Mo', 'Moz', 'Mozi', 'Mozil', 'Mozill', 'Mozilla', 'Mozilla ',
     'Mozilla F', 'Mozilla Fi']
    """
    n = min(len(string), MAX_LENGTH)
    for i in range(n):
        yield string[:i + 1]


def queue(query):
    """
    Explodes a string, then queues a task to generate a recommendation for
    each prefix it yields, unless this script has already queued that prefix.
    """
    for q in explode(query):
        if q and q not in QUEUE:
            QUEUE.append(q)
            # Assumed to be a Celery task: .delay() schedules it to run
            # asynchronously on a worker.
            recommend.delay(q)
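
# For example, assuming a Celery broker is configured (strings are
# illustrative):
#
#     queue('Firefox')          # queues 'F', 'Fi', ..., 'Firefox'
#     queue('Firefox for iOS')  # queues only the prefixes not already in
#                               # QUEUE: 'Firefox ', 'Firefox f', 'Firefox fo'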


def wikipedia():
    """
    Opens the Wikipedia data file, which contains the names of the 250 most
    popular articles, and queues every unique string it contains.

    Source: https://en.wikipedia.org/wiki/User:West.andrew.g/Popular_pages
    Retrieved: 18/04/2016
    """
    with open(path.join(CWD, 'data', 'wikipedia.txt'), 'r') as f:
        for article in f:
            # Strip the trailing newline so it never ends up in a prefix.
            queue(article.strip())


def alexa():
    """
    Opens the Alexa data file, which contains the 1000 top domain names on
    the internet, and queues every unique string it contains.

    Source: http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
    Retrieved: 18/04/2016
    """
    with open(path.join(CWD, 'data', 'alexa.txt'), 'r') as f:
        for domain in f:
            # Keep only the base name: everything before the first dot
            # (e.g. 'google' from 'google.com'). Guard against lines that
            # do not match, since re.match() returns None for them.
            base = re.match(r'([^.]+)', domain.strip())
            if base:
                queue(base.group(1))
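
# For example (hypothetical input lines):
#
#     'google.com\n' -> 'google' is queued
#     'bbc.co.uk\n'  -> 'bbc' is queued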


if __name__ == '__main__':
    wikipedia()
    alexa()
    stdout.write('{:d} items queued.\n'.format(len(QUEUE)))