 class Spider(HTMLParser):
-    def __init__(self, url, max_visits, limit_path='', allowed_hosts=[]):
+    def __init__(self, url, max_visits, limit_paths, allowed_hosts=[]):
         HTMLParser.__init__(self)
         self.url = url
         self.max_visits = max_visits
         self.allowed_hosts = allowed_hosts
         proto, self.host, path = self.url_parts(url)
-        self.limit_path = limit_path
+        self.limit_paths = limit_paths
             try:
                 foo = self.allowed_hosts.index(host)
             except ValueError:
                 log_info("Skipping remote host %s..." % host)
                 continue
-            if self.limit_path:
-                if path[:len(self.limit_path)] != self.limit_path:
-                    log_info("Skipping forbidden base path %s..." % path)
-                    continue
+            # An empty limit_paths list means no path restriction, matching
+            # the old behaviour when limit_path was unset.
+            valid = not self.limit_paths
+            for lpath in self.limit_paths:
+                if path.startswith(lpath):
+                    valid = True
+                    break
+
+            if not valid:
+                log_info("Skipping forbidden base path %s..." % path)
+                continue
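
For reference, the new prefix test collapses to a single expression. The snippet below is only an illustration with made-up values; path and limit_paths stand in for the variables used in the patch:

    limit_paths = ['/docs', '/wiki']    # hypothetical configured prefixes
    path = '/docs/intro.html'           # path of a discovered link
    valid = not limit_paths or any(path.startswith(p) for p in limit_paths)

An empty list keeps the old default-allow behaviour, which is why the loop above starts from valid = not self.limit_paths.
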
             try:
                 log_info("Opening URL %s" % self.url)
         props = Properties.get_properties()
         start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
         max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
-        limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+        limit_paths = props.get_property('constrictor.plugin.web_spider.limit_paths')
+        # The property holds a comma-separated list; guard against it being
+        # unset so split() is not called on None.
+        limit_paths = limit_paths.split(',') if limit_paths else []
         if not start_url or not max_pages:
             log_error("Missing required properties: " +
                       "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
             return False
-        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_paths)
         result = spider.crawl()
         return True
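
With this change the plugin property takes a comma-separated list of path prefixes rather than a single prefix. A minimal sketch of the relevant properties (all values below are hypothetical):

    constrictor.plugin.web_spider.start_url=http://localhost:8080/
    constrictor.plugin.web_spider.max_pages=100
    constrictor.plugin.web_spider.limit_paths=/docs,/wiki

Note that split(',') does not strip whitespace, so the prefixes should be written without spaces around the commas.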