- added property name sorting to the store() method
"""
-import sys,os
-import re
-import time
+import sys, os, re, time
class IllegalArgumentException(Exception):
except KeyError:
if hasattr(self._props,name):
return getattr(self._props, name)
+
+ def get_thread_prop(self, prop, unique=False):
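+        # The property value is a comma-separated list; return the item
+        # belonging to the calling thread.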
+ from constrictor.script import ScriptThread
+
+ data = self.get_property(prop)
+ data = data.split(',')
+
+ currentThread = ScriptThread.get_thread_id()
+ totalThreads = self.get_property('constrictor.numThreads')
+
+ if len(data) > currentThread:
+ return data[currentThread]
+
+ if unique:
+ raise Exception(
+ "Too many threads for unique data. Thread index is %d, size of dataset is %d" % (
+ currentThread, len(data)))
+
+ # data sharing is OK
+ return data[currentThread % len(data)]
+
if __name__=="__main__":
p = Properties()
--- /dev/null
+#!/usr/bin/python
+# --------------------------------------------------------------
+# Simple web spider sample script. Each task fetches a single page
+# while the spider crawls links starting from a configured URL.
+# --------------------------------------------------------------
+
+import re
+import urllib2
+from HTMLParser import HTMLParser
+from constrictor.task import Task
+from constrictor.script import Script, ScriptManager
+from constrictor.properties import Properties
+from constrictor.log import *
+
+
+class PageFetchTask(Task):
+ def __init__(self, spider, name=None):
+ Task.__init__(self, name)
+ self.spider = spider
+
+ def run(self):
+ # fetch a single page
+ return self.spider.fetch_url()
+
+class WebSpiderScript(Script):
+
+ # Heavily modified version of the script found at
+ # http://www.halotis.com/2009/09/16/python-web-crawler-script/
+
+ class Spider(HTMLParser):
+
+        def __init__(self, url, max_visits, limit_path='', allowed_hosts=None):
+
+ HTMLParser.__init__(self)
+ self.url = url
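+            # db counts how many times each URL has been seen;
+            # url_list is the crawl frontier, seeded with the start URL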
+ self.db = {self.url: 1}
+ self.url_list = [self.url]
+ self.max_visits = max_visits
+            self.allowed_hosts = allowed_hosts if allowed_hosts is not None else []
+ proto, self.host, path = self.url_parts(url)
+ self.limit_path = limit_path
+
+            if self.host not in self.allowed_hosts:
+                self.allowed_hosts.append(self.host)
+
+        def url_parts(self, url):
+            # split a URL into protocol, host and path components
+            res = re.search(r'^(https?)://([^/]+)(.*)', url)
+            if res is None:
+                raise Exception("Invalid URL: %s" % url)
+
+            proto = res.group(1)
+            host = res.group(2)
+            path = res.group(3)
+
+            return proto, host, path
+
+ def handle_starttag(self, tag, attrs):
+
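+            # collect href targets from anchor tags, converting relative
+            # links into absolute URLs before queueing them for the crawl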
+            if tag == 'a' and attrs:
+
+                link = dict(attrs).get('href')
+                if not link:
+                    return
+
+ if link[:4] != "http":
+ proto, host, path = self.url_parts(self.url)
+
+ if link[:1] == '/': # full path
+ link = "%s://%s%s" % (proto, host, link)
+
+                elif link[:1] == '?': # GET params only
+                    # replace any existing query string on the current path
+                    path = "%s%s" % (path.split('?')[0], link)
+
+                else: # relative path
+                    # swap the final path segment for the relative link
+                    path = "%s/%s" % (path.rsplit('/', 1)[0], link)
+
+ link = "%s://%s%s" % (proto, host, path)
+
+ if link not in self.db:
+ self.url_list.append(link)
+
+ self.db[link] = (self.db.get(link) or 0) + 1
+
+ def fetch_url(self):
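+            # download the current URL and return the raw HTML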
+ req = urllib2.urlopen(self.url)
+ return req.read()
+
+ def crawl(self):
+
+ visited = 0
+
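+            # url_list grows while we iterate: feed() appends newly
+            # discovered links, so the loop walks the expanding frontier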
+ for self.url in self.url_list:
+
+                if visited >= self.max_visits: break
+
+ log_debug("Visited %d URLs" % visited)
+
+ proto, host, path = self.url_parts(self.url)
+
+                if host not in self.allowed_hosts:
+                    log_info("Skipping remote host %s..." % host)
+                    continue
+
+                if self.limit_path and not path.startswith(self.limit_path):
+                    log_info("Skipping forbidden base path %s..." % path)
+                    continue
+
+ try:
+ log_info("Opening URL %s" % self.url)
+ res = PageFetchTask(self).start()
+ self.reset()
+ self.feed(res)
+ visited += 1
+                except Exception, e:
+                    log_error("Error fetching %s: %s" % (self.url, e))
+                    self.reset()
+
+ log_info("Found %d distinct URLs" % len(self.db.keys()))
+
+ def run(self):
+
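+        # Crawl settings come from the constrictor properties file; the
+        # start URL is per-thread (comma-separated list), the rest are
+        # shared. Example values (illustrative only):
+        #   constrictor.plugin.web_spider.start_url = http://example.com/,http://example.org/
+        #   constrictor.plugin.web_spider.max_pages = 50
+        #   constrictor.plugin.web_spider.limit_path = /docs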
+ props = Properties.get_properties()
+ start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
+ max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
+ limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+
+ if not start_url or not max_pages:
+ log_error("Missing required properties: " +
+ "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
+ return False
+
+ spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider.crawl()
+
+ return True
+
+# Launch the script
+ScriptManager.go(WebSpiderScript())
+
+