# universal-search-recommendation/prepopulate.py
# (File-viewer listing metadata removed: "75 строки / 2.0 KiB / Python / Executable File".)
#!/usr/bin/env python
import re
from os import path
from sys import stdout
from recommendation.tasks.task_recommend import recommend
# Directory containing this script; the data files are resolved relative to it.
CWD = path.dirname(path.realpath(__file__))
# Longest prefix (in characters) queued per input string.
MAX_LENGTH = 10
# Every query queued so far; consulted to avoid queueing duplicates.
QUEUE = []


def explode(string):
    """
    Yield every prefix of ``string``, from length 1 up to at most
    MAX_LENGTH characters.

    Example:
    >>> list(explode('Firefox'))
    ['F', 'Fi', 'Fir', 'Fire', 'Firef', 'Firefo', 'Firefox']
    >>> list(explode('Mozilla Firefox'))
    ['M', 'Mo', 'Moz', 'Mozi', 'Mozil', 'Mozill', 'Mozilla', 'Mozilla ',
     'Mozilla F', 'Mozilla Fi']
    """
    # min() replaces the original hand-rolled conditional; behavior is
    # identical, including yielding nothing for the empty string.
    for end in range(1, min(len(string), MAX_LENGTH) + 1):
        yield string[:end]
def queue(query):
    """
    Queue a recommendation task for each previously-unseen prefix of
    ``query``.

    Every prefix produced by explode() is recorded in the module-level
    QUEUE list, so repeated or overlapping inputs never enqueue the same
    recommendation task twice.
    """
    for prefix in explode(query):
        if prefix and prefix not in QUEUE:
            QUEUE.append(prefix)
            recommend.delay(prefix)
def wikipedia():
    """
    Open the Wikipedia data file, which contains the names of the 250 most
    popular articles, and queue every unique prefix of each article name.

    Source: https://en.wikipedia.org/wiki/User:West.andrew.g/Popular_pages
    Retrieved: 18/04/2016
    """
    with open(path.join(CWD, 'data', 'wikipedia.txt'), 'r') as f:
        # Iterate the file lazily instead of materializing readlines().
        for line in f:
            # Strip the trailing newline so it cannot end up inside a
            # queued prefix (the original queued e.g. 'Dog\n' for short
            # article names); skip blank lines entirely.
            article = line.strip()
            if article:
                queue(article)
def alexa():
    """
    Open the Alexa data file, which contains the 1000 top domain names on
    the internet, and queue every unique prefix of each base domain name.

    Source: http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
    Retrieved: 18/04/2016
    """
    with open(path.join(CWD, 'data', 'alexa.txt'), 'r') as f:
        for domain in f:
            # Take everything before the first dot ('google.com' -> 'google').
            # Strip first: the character class would otherwise swallow the
            # trailing '\n' on dot-less lines. ('\.' inside a class is just '.',
            # so '[^.]+' is equivalent to the original pattern.)
            base = re.match(r'([^.]+)', domain.strip())
            # The original called base.groups() without checking for a failed
            # match, which raised AttributeError on blank or dot-leading lines.
            if base:
                queue(base.group(0))
if __name__ == '__main__':
    # Populate the recommendation queue from both data sources, then
    # report how many unique queries were enqueued in total.
    wikipedia()
    alexa()
    stdout.write('{:d} items queued.\n'.format(len(QUEUE)))