 class Spider(HTMLParser):
-    def __init__(self, url, max_visits, limit_path='', allowed_hosts=[]):
+    def __init__(self, url, max_visits, limit_paths, allowed_hosts=[]):
         HTMLParser.__init__(self)
         self.url = url
         self.max_visits = max_visits
         self.allowed_hosts = allowed_hosts
         proto, self.host, path = self.url_parts(url)
-        self.limit_path = limit_path
+        self.limit_paths = limit_paths
             try:
                 foo = self.allowed_hosts.index(host)
             except ValueError:
                 log_info("Skipping remote host %s..." % host)
                 continue
-            if self.limit_path:
-                if path[:len(self.limit_path)] != self.limit_path:
-                    log_info("Skipping forbidden base path %s..." % path)
-                    continue
+            # An empty limit_paths list means no path restriction, matching
+            # the old behaviour when limit_path was unset.
+            valid = not self.limit_paths
+            for lpath in self.limit_paths:
+                if path.startswith(lpath):
+                    valid = True
+                    break
+
+            if not valid:
+                log_info("Skipping forbidden base path %s..." % path)
+                continue
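
For reference, the new prefix test collapses to a single expression. The snippet below is only an illustration with made-up values; path and limit_paths stand in for the variables used in the patch:

    limit_paths = ['/docs', '/wiki']    # hypothetical configured prefixes
    path = '/docs/intro.html'           # path of a discovered link
    valid = not limit_paths or any(path.startswith(p) for p in limit_paths)

An empty list keeps the old default-allow behaviour, which is why the loop above starts from valid = not self.limit_paths.
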
             try:
                 log_info("Opening URL %s" % self.url)
         props = Properties.get_properties()
         start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
         max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
-        limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+        limit_paths = props.get_property('constrictor.plugin.web_spider.limit_paths')
+        # The property holds a comma-separated list; guard against it being
+        # unset so split() is not called on None.
+        limit_paths = limit_paths.split(',') if limit_paths else []
         if not start_url or not max_pages:
             log_error("Missing required properties: " +
                       "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
             return False
-        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_paths)
         result = spider.crawl()
         return True
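
With this change the plugin property takes a comma-separated list of path prefixes rather than a single prefix. A minimal sketch of the relevant properties (all values below are hypothetical):

    constrictor.plugin.web_spider.start_url=http://localhost:8080/
    constrictor.plugin.web_spider.max_pages=100
    constrictor.plugin.web_spider.limit_paths=/docs,/wiki

Note that split(',') does not strip whitespace, so the prefixes should be written without spaces around the commas.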