#!/usr/bin/env python
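"""
Seeds the recommendation queue: explodes the most popular Wikipedia
article titles and the top Alexa domain names into prefixes, and queues
a recommendation task for each unique prefix.

Note: `recommend` is assumed to be a Celery task (inferred from the
`.delay()` calls below); a broker and worker must be running for the
queued tasks to be processed.
"""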

import re
from os import path
from sys import stdout

from recommendation.tasks.task_recommend import recommend


# Directory containing this script; the data files live in ./data beside it.
CWD = path.dirname(path.realpath(__file__))

# Longest prefix that explode() will yield for any one string.
MAX_LENGTH = 10

# Prefixes queued so far; used to avoid queueing the same string twice.
QUEUE = []
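
# Both data files are plain text with one entry per line. Illustrative
# (hypothetical) contents:
#
#     data/wikipedia.txt: Main Page
#                         United States
#     data/alexa.txt:     google.com
#                         youtube.com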


def explode(string):
    """
    Returns an iterator that yields every prefix of the given string,
    starting with length 1 and ending with length MAX_LENGTH.

    Example:
    >>> list(explode('Firefox'))
    ['F', 'Fi', 'Fir', 'Fire', 'Firef', 'Firefo', 'Firefox']
    >>> list(explode('Mozilla Firefox'))  # doctest: +NORMALIZE_WHITESPACE
    ['M', 'Mo', 'Moz', 'Mozi', 'Mozil', 'Mozill', 'Mozilla', 'Mozilla ',
     'Mozilla F', 'Mozilla Fi']
    """
    n = min(len(string), MAX_LENGTH)
    for i in range(n):
        yield string[:i + 1]


def queue(query):
    """
    Explodes a string, then queues a task to generate a recommendation for
    each prefix it yields, unless this script has already queued that prefix.
    """
    for q in explode(query):
        if q and q not in QUEUE:
            QUEUE.append(q)
            # Assumed to be a Celery task: .delay() schedules it to run
            # asynchronously on a worker.
            recommend.delay(q)
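
# For example, assuming a Celery broker is configured (strings are
# illustrative):
#
#     queue('Firefox')          # queues 'F', 'Fi', ..., 'Firefox'
#     queue('Firefox for iOS')  # queues only the prefixes not already in
#                               # QUEUE: 'Firefox ', 'Firefox f', 'Firefox fo'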


def wikipedia():
    """
    Opens the Wikipedia data file, which contains the names of the 250 most
    popular articles, and queues every unique string it contains.

    Source: https://en.wikipedia.org/wiki/User:West.andrew.g/Popular_pages
    Retrieved: 18/04/2016
    """
    with open(path.join(CWD, 'data', 'wikipedia.txt'), 'r') as f:
        for article in f:
            # Strip the trailing newline so it never ends up in a prefix.
            queue(article.strip())


def alexa():
    """
    Opens the Alexa data file, which contains the 1000 top domain names on
    the internet, and queues every unique string it contains.

    Source: http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
    Retrieved: 18/04/2016
    """
    with open(path.join(CWD, 'data', 'alexa.txt'), 'r') as f:
        for domain in f:
            # Keep only the base name: everything before the first dot
            # (e.g. 'google' from 'google.com'). Guard against lines that
            # do not match, since re.match() returns None for them.
            base = re.match(r'([^.]+)', domain.strip())
            if base:
                queue(base.group(1))
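
# For example (hypothetical input lines):
#
#     'google.com\n' -> 'google' is queued
#     'bbc.co.uk\n'  -> 'bbc' is queued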


if __name__ == '__main__':
    wikipedia()
    alexa()
    stdout.write('{:d} items queued.\n'.format(len(QUEUE)))