Added eventlet example: recursive webcrawler

Ask Solem 2010-11-30 16:04:36 +01:00
Parent e6451a6117
Commit 6f252d0da0
2 changed files with 39 additions and 1 deletion

View file

@@ -12,4 +12,4 @@ CELERY_DISABLE_RATE_LIMITS = True
 CELERY_RESULT_BACKEND = "amqp"
 CELERY_TASK_RESULT_EXPIRES = 30 * 60
-CELERY_IMPORTS = ("tasks", )
+CELERY_IMPORTS = ("tasks", "webcrawler")

View file

@@ -0,0 +1,38 @@
"""Recursive webcrawler example.
One problem with this solution is that it does not remember
urls it has already seen.
To add support for this a bloom filter or redis sets can be used.
"""
from __future__ import with_statement
import re
import time
import urlparse
from celery.decorators import task
from eventlet import Timeout
from eventlet.green import urllib2
# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
def domain(url):
return urlparse.urlsplit(url)[1].split(":")[0]
@task
def crawl(url):
print("crawling: %r" % (url, ))
location = domain(url)
data = ''
with Timeout(5, False):
data = urllib2.urlopen(url).read()
for url_match in url_regex.finditer(data):
new_url = url_match.group(0)
# Don't destroy the internet
if location in domain(new_url):
crawl.delay(new_url)
time.sleep(0.3)
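The docstring notes that already-seen urls are not remembered and suggests a bloom filter or redis sets. A minimal sketch of the redis-set variant, assuming the redis-py client and a local Redis server; the key name "webcrawler:seen" and the seen() helper are illustrative and not part of this commit:

    import redis

    redis_client = redis.Redis()  # assumes Redis on localhost:6379

    def seen(url):
        # SADD returns 1 the first time a member is added and 0 afterwards,
        # so a 0 result means this url was crawled before.
        return redis_client.sadd("webcrawler:seen", url) == 0

crawl() could then check seen(new_url) before calling crawl.delay(new_url), skipping urls that were already visited.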