- added property name sorting to the store() method
"""
-import sys,os
-import re
-import time
+import sys, os, re, time
class IllegalArgumentException(Exception):
except KeyError:
if hasattr(self._props,name):
return getattr(self._props, name)
+
+ def get_thread_prop(self, prop, unique=False):
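+        # The property value is a comma-separated list; return the item
+        # belonging to the calling thread.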
+ from constrictor.script import ScriptThread
+
+ data = self.get_property(prop)
+ data = data.split(',')
+
+ currentThread = ScriptThread.get_thread_id()
+ totalThreads = self.get_property('constrictor.numThreads')
+
+ if len(data) > currentThread:
+ return data[currentThread]
+
+ if unique:
+ raise Exception(
+ "Too many threads for unique data. Thread index is %d, size of dataset is %d" % (
+ currentThread, len(data)))
+
+ # data sharing is OK
+ return data[currentThread % len(data)]
+
if __name__=="__main__":
p = Properties()
--- /dev/null
+#!/usr/bin/python
+# --------------------------------------------------------------
+# Simple web spider sample script. Each task fetches a single page
+# while the spider crawls links starting from a configured URL.
+# --------------------------------------------------------------
+
+import re
+import urllib2
+from HTMLParser import HTMLParser
+from constrictor.task import Task
+from constrictor.script import Script, ScriptManager
+from constrictor.properties import Properties
+from constrictor.log import *
+
+
+class PageFetchTask(Task):
+ def __init__(self, spider, name=None):
+ Task.__init__(self, name)
+ self.spider = spider
+
+ def run(self):
+ # fetch a single page
+ return self.spider.fetch_url()
+
+class WebSpiderScript(Script):
+
+ # Heavily modified version of the script found at
+ # http://www.halotis.com/2009/09/16/python-web-crawler-script/
+
+ class Spider(HTMLParser):
+
+        def __init__(self, url, max_visits, limit_path='', allowed_hosts=None):
+
+ HTMLParser.__init__(self)
+ self.url = url
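+            # db counts how many times each URL has been seen;
+            # url_list is the crawl frontier, seeded with the start URL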
+ self.db = {self.url: 1}
+ self.url_list = [self.url]
+ self.max_visits = max_visits
+            self.allowed_hosts = allowed_hosts if allowed_hosts is not None else []
+ proto, self.host, path = self.url_parts(url)
+ self.limit_path = limit_path
+
+            if self.host not in self.allowed_hosts:
+                self.allowed_hosts.append(self.host)
+
+        def url_parts(self, url):
+            # split a URL into protocol, host and path components
+            res = re.search(r'^(https?)://([^/]+)(.*)', url)
+            if res is None:
+                raise Exception("Invalid URL: %s" % url)
+
+            proto = res.group(1)
+            host = res.group(2)
+            path = res.group(3)
+
+            return proto, host, path
+
+ def handle_starttag(self, tag, attrs):
+
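+            # collect href targets from anchor tags, converting relative
+            # links into absolute URLs before queueing them for the crawl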
+            if tag == 'a' and attrs:
+
+                link = dict(attrs).get('href')
+                if not link:
+                    return
+
+ if link[:4] != "http":
+ proto, host, path = self.url_parts(self.url)
+
+ if link[:1] == '/': # full path
+ link = "%s://%s%s" % (proto, host, link)
+
+                elif link[:1] == '?': # GET params only
+                    # replace any existing query string on the current path
+                    path = "%s%s" % (path.split('?')[0], link)
+
+                else: # relative path
+                    # swap the final path segment for the relative link
+                    path = "%s/%s" % (path.rsplit('/', 1)[0], link)
+
+ link = "%s://%s%s" % (proto, host, path)
+
+ if link not in self.db:
+ self.url_list.append(link)
+
+ self.db[link] = (self.db.get(link) or 0) + 1
+
+ def fetch_url(self):
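+            # download the current URL and return the raw HTML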
+ req = urllib2.urlopen(self.url)
+ return req.read()
+
+ def crawl(self):
+
+ visited = 0
+
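+            # url_list grows while we iterate: feed() appends newly
+            # discovered links, so the loop walks the expanding frontier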
+ for self.url in self.url_list:
+
+                if visited >= self.max_visits: break
+
+ log_debug("Visited %d URLs" % visited)
+
+ proto, host, path = self.url_parts(self.url)
+
+                if host not in self.allowed_hosts:
+                    log_info("Skipping remote host %s..." % host)
+                    continue
+
+                if self.limit_path and not path.startswith(self.limit_path):
+                    log_info("Skipping forbidden base path %s..." % path)
+                    continue
+
+ try:
+ log_info("Opening URL %s" % self.url)
+ res = PageFetchTask(self).start()
+ self.reset()
+ self.feed(res)
+ visited += 1
+                except Exception, e:
+                    log_error("Error fetching %s: %s" % (self.url, e))
+                    self.reset()
+
+ log_info("Found %d distinct URLs" % len(self.db.keys()))
+
+ def run(self):
+
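+        # Crawl settings come from the constrictor properties file; the
+        # start URL is per-thread (comma-separated list), the rest are
+        # shared. Example values (illustrative only):
+        #   constrictor.plugin.web_spider.start_url = http://example.com/,http://example.org/
+        #   constrictor.plugin.web_spider.max_pages = 50
+        #   constrictor.plugin.web_spider.limit_path = /docs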
+ props = Properties.get_properties()
+ start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
+ max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
+ limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+
+ if not start_url or not max_pages:
+ log_error("Missing required properties: " +
+ "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
+ return False
+
+ spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider.crawl()
+
+ return True
+
+# Launch the script
+ScriptManager.go(WebSpiderScript())
+
+