Added eventlet example: recursive webcrawler

Ask Solem 2010-11-30 16:04:36 +01:00
Parent e6451a6117
Commit 6f252d0da0
2 changed files with 39 additions and 1 deletion

View file

@@ -12,4 +12,4 @@ CELERY_DISABLE_RATE_LIMITS = True
 CELERY_RESULT_BACKEND = "amqp"
 CELERY_TASK_RESULT_EXPIRES = 30 * 60
-CELERY_IMPORTS = ("tasks", )
+CELERY_IMPORTS = ("tasks", "webcrawler")

View file

@@ -0,0 +1,38 @@
"""Recursive webcrawler example.
One problem with this solution is that it does not remember
urls it has already seen.
To add support for this a bloom filter or redis sets can be used.
"""
from __future__ import with_statement
import re
import time
import urlparse
from celery.decorators import task
from eventlet import Timeout
from eventlet.green import urllib2
# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
def domain(url):
return urlparse.urlsplit(url)[1].split(":")[0]
@task
def crawl(url):
print("crawling: %r" % (url, ))
location = domain(url)
data = ''
with Timeout(5, False):
data = urllib2.urlopen(url).read()
for url_match in url_regex.finditer(data):
new_url = url_match.group(0)
# Don't destroy the internet
if location in domain(new_url):
crawl.delay(new_url)
time.sleep(0.3)
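The docstring notes that already-seen urls are not remembered and suggests a bloom filter or redis sets. A minimal sketch of the redis-set variant, assuming the redis-py client and a local Redis server; the key name "webcrawler:seen" and the seen() helper are illustrative and not part of this commit:

    import redis

    redis_client = redis.Redis()  # assumes Redis on localhost:6379

    def seen(url):
        # SADD returns 1 the first time a member is added and 0 afterwards,
        # so a 0 result means this url was crawled before.
        return redis_client.sadd("webcrawler:seen", url) == 0

crawl() could then check seen(new_url) before calling crawl.delay(new_url), skipping urls that were already visited.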