From 4303c3dba72eaa87dd92af622540fbd4f493441d Mon Sep 17 00:00:00 2001
From: erickson
Date: Sat, 29 Jan 2011 16:06:52 +0000
Subject: [PATCH] Web spider sample script

Simple web spider script that visits, reports, parses, collects links,
and continues until it has fetched a configured number of pages.  Each
page load is a constrictor Task so timing data can be collected.

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/constrictor/trunk@1215 6d9bc8c9-1ec2-4278-b937-99fde70a366f
---
 constrictor.properties    |  12 ++++
 constrictor/data.py       |   1 -
 constrictor/log.py        |   4 +-
 constrictor/properties.py |  25 +++++++-
 constrictor/task.py       |   4 +-
 samples/web_spider.py     | 153 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 192 insertions(+), 7 deletions(-)
 create mode 100755 samples/web_spider.py

diff --git a/constrictor.properties b/constrictor.properties
index d2ffa3d82..378c9a46a 100644
--- a/constrictor.properties
+++ b/constrictor.properties
@@ -33,6 +33,18 @@ constrictor.port=21800
 
 #logs to stdout and stderr. options are 0=none,1=error,2=info,3=debug
 constrictor.loglevel=2
+# ---- Settings for sample web spider plugin --------------
+
+# Initial URL.  Can be different per thread w/ comma-separated list
+constrictor.plugin.web_spider.start_url=http://example.org/somepath?foo=bar,http://example.org/somepath?foo=bar2,http://example.org/somepath?foo=bar3
+
+# Each spider thread will stop crawling after fetching this many pages
+constrictor.plugin.web_spider.max_pages=100
+
+# Only allow the spider to fetch pages with a certain base path
+constrictor.plugin.web_spider.limit_path=/somepath
+
+
 
 
 
diff --git a/constrictor/data.py b/constrictor/data.py
index 802dadb90..16be41d2f 100644
--- a/constrictor/data.py
+++ b/constrictor/data.py
@@ -118,7 +118,6 @@ class Data(object):
 
         for task in self.runtime_data:
             task_times += task['duration']
-            log.log_debug("Storing " + task['name'])
 
             if task['name'] not in task_counts:
                 task_counts[task['name']] = 0;
diff --git a/constrictor/log.py b/constrictor/log.py
index a5ca65a75..c35059593 100644
--- a/constrictor/log.py
+++ b/constrictor/log.py
@@ -27,7 +27,7 @@ def init_log():
 def log_error(msg=''):
     if loglevel < 1: return
     from script import ScriptThread
-    sys.stderr.write('Error[%d]: %s\n' % (ScriptThread.get_thread_id(), msg))
+    sys.stderr.write('Err [%d]: %s\n' % (ScriptThread.get_thread_id(), msg))
     sys.stderr.flush()
 
 def log_info(msg=''):
@@ -38,4 +38,4 @@ def log_info(msg=''):
 def log_debug(msg=''):
     if loglevel < 3: return
     from script import ScriptThread
-    print 'Debug[%d]: %s' % (ScriptThread.get_thread_id(), msg)
+    print 'Debg[%d]: %s' % (ScriptThread.get_thread_id(), msg)
diff --git a/constrictor/properties.py b/constrictor/properties.py
index c3ac41a10..a306e9ff3 100644
--- a/constrictor/properties.py
+++ b/constrictor/properties.py
@@ -9,9 +9,7 @@
 Edited by Bill Erickson
   - added property name sorting to the store() method
 """
 
-import sys,os
-import re
-import time
+import sys, os, re, time
 
 class IllegalArgumentException(Exception):
@@ -318,6 +316,27 @@ class Properties(object):
         except KeyError:
             if hasattr(self._props,name):
                 return getattr(self._props, name)
+
+    def get_thread_prop(self, prop, unique=False):
+        from constrictor.script import ScriptThread
+
+        data = self.get_property(prop)
+        data = data.split(',')
+
+        currentThread = ScriptThread.get_thread_id()
+        totalThreads = self.get_property('constrictor.numThreads')
+
+        if len(data) > currentThread:
+            return data[currentThread]
+
+        if unique:
+            raise Exception("Too many threads for unique data. "
+                "Thread index is %d, size of dataset is %d" % (
+                currentThread, len(data)))
+
+        # data sharing is OK
+        return data[currentThread % len(data)]
+
 
 if __name__=="__main__":
     p = Properties()
diff --git a/constrictor/task.py b/constrictor/task.py
index e7a3276e5..e858ade71 100644
--- a/constrictor/task.py
+++ b/constrictor/task.py
@@ -29,7 +29,9 @@ class Task(object):
     on the actual Task object.
     """
 
-    def __init__(self, name=''):
+    def __init__(self, name=None):
+        if name is None:
+            name = self.__class__.__name__
         self.name = name
         self.reset()
 
diff --git a/samples/web_spider.py b/samples/web_spider.py
new file mode 100755
index 000000000..4eb5ef264
--- /dev/null
+++ b/samples/web_spider.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python
+# --------------------------------------------------------------
+# Simple web spider sample.  Each page fetch runs as a
+# constrictor Task so per-page timing data can be collected.
+# --------------------------------------------------------------
+
+import random, time, sys, re
+import urllib2
+from HTMLParser import HTMLParser
+from constrictor.task import Task
+from constrictor.script import Script, ScriptManager
+from constrictor.properties import Properties
+from constrictor.log import *
+
+
+class PageFetchTask(Task):
+    def __init__(self, spider, name=None):
+        Task.__init__(self, name)
+        self.spider = spider
+
+    def run(self):
+        # fetch a single page
+        return self.spider.fetch_url()
+
+class WebSpiderScript(Script):
+
+    # Heavily modified version of the script found at
+    # http://www.halotis.com/2009/09/16/python-web-crawler-script/
+
+    class Spider(HTMLParser):
+
+        def __init__(self, url, max_visits, limit_path='', allowed_hosts=[]):
+
+            HTMLParser.__init__(self)
+            self.url = url
+            self.db = {self.url: 1}
+            self.url_list = [self.url]
+            self.max_visits = max_visits
+            self.allowed_hosts = allowed_hosts
+            proto, self.host, path = self.url_parts(url)
+            self.limit_path = limit_path
+
+            try:
+                foo = self.allowed_hosts.index(self.host)
+            except ValueError:
+                self.allowed_hosts.append(self.host)
+
+        def url_parts(self, url):
+            proto = ''
+            host = ''
+            path = ''
+
+            res = re.search('^(https?)://([^\/]+)(.*)', url)
+            try:
+                proto = res.group(1)
+                host = res.group(2)
+            except (AttributeError, IndexError):
+                raise Exception("Invalid URL: %s" % url)
+            try:
+                path = res.group(3)
+            except IndexError:
+                pass
+
+            return proto, host, path
+
+
+        def handle_starttag(self, tag, attrs):
+
+            if tag == 'a' and attrs:
+
+                link = attrs[0][1]
+
+                if link[:4] != "http":
+                    proto, host, path = self.url_parts(self.url)
+
+                    if link[:1] == '/': # full path
+                        path = link
+
+                    elif link[:1] == '?': # GET params only
+                        res = re.match('(.*)\?.*', path)
+                        path = "%s%s" % (res.group(1), link)
+
+                    else: # relative path
+                        parts = path.split('/')
+                        path = path.replace(parts[-1:][0], link)
+
+                    link = "%s://%s%s" % (proto, host, path)
+
+                if link not in self.db:
+                    self.url_list.append(link)
+
+                self.db[link] = (self.db.get(link) or 0) + 1
+
+        def fetch_url(self):
+            req = urllib2.urlopen(self.url)
+            return req.read()
+
+        def crawl(self):
+
+            visited = 0
+
+            for self.url in self.url_list:
+
+                if visited >= self.max_visits: break
+
+                log_debug("Visited %d URLs" % visited)
+
+                proto, host, path = self.url_parts(self.url)
+
+                try:
+                    self.allowed_hosts.index(host)
+                except ValueError:
+                    log_info("Skipping remote host %s..." % host)
+                    continue
+
+                if self.limit_path:
+                    if path[:len(self.limit_path)] != self.limit_path:
+                        log_info("Skipping forbidden base path %s..."
+                            % path)
+                        continue
+
+                try:
+                    log_info("Opening URL %s" % self.url)
+                    res = PageFetchTask(self).start()
+                    self.reset()
+                    self.feed(res)
+                    visited += 1
+                except:
+                    self.reset()
+
+            log_info("Found %d distinct URLs" % len(self.db.keys()))
+
+    def run(self):
+
+        props = Properties.get_properties()
+        start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
+        max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
+        limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+
+        if not start_url or not max_pages:
+            log_error("Missing required properties: " +
+                "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
+            return False
+
+        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider.crawl()
+
+        return True
+
+# Launch the script
+ScriptManager.go(WebSpiderScript())
+
+
-- 
2.11.0
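
A note on the comma-separated start_url setting above: get_thread_prop() hands each constrictor script thread its own entry and wraps around when there are more threads than entries. The sketch below mirrors that lookup outside of constrictor; pick_start_url and thread_id are hypothetical stand-ins for illustration (thread_id plays the role of ScriptThread.get_thread_id()).

    # Stand-alone sketch of the per-thread property lookup used by
    # get_thread_prop(); 'thread_id' stands in for ScriptThread.get_thread_id().
    def pick_start_url(prop_value, thread_id, unique=False):
        urls = prop_value.split(',')
        if thread_id < len(urls):
            return urls[thread_id]              # one URL per thread
        if unique:
            raise Exception("Too many threads for unique data. "
                "Thread index is %d, size of dataset is %d" % (thread_id, len(urls)))
        return urls[thread_id % len(urls)]      # share URLs when threads outnumber them

    value = ("http://example.org/somepath?foo=bar,"
             "http://example.org/somepath?foo=bar2,"
             "http://example.org/somepath?foo=bar3")
    for tid in range(5):                        # e.g. 5 threads, 3 URLs
        print(pick_start_url(value, tid))       # threads 3 and 4 wrap to URLs 0 and 1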
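Because each page load runs through PageFetchTask, constrictor's normal per-task timing applies to every fetch. The same pattern fits any other unit of work worth timing. Below is a minimal sketch assuming only what the patch itself shows (a Task subclass overriding run(), started with start(), with durations ending up in the runtime data that data.py aggregates); SleepTask and the 0.25-second figure are made up for illustration.

    import time
    from constrictor.task import Task

    class SleepTask(Task):
        # With the Task.__init__ change in this patch, the task name
        # defaults to the class name ("SleepTask") when none is given.
        def run(self):
            time.sleep(0.25)    # stand-in for the real work being timed
            return True

    # start() runs the task the same way PageFetchTask is started in
    # crawl(), so a duration is collected for each started task.
    SleepTask().start()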
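Finally, a side note on the hand-rolled link absolutizing in handle_starttag: the standard library's urljoin covers the same three cases (absolute path, query-only, relative path) and may be easier to maintain. A small sketch, in the Python 2 spelling to match the script (urllib.parse.urljoin on Python 3):

    from urlparse import urljoin    # urllib.parse.urljoin on Python 3

    base = "http://example.org/somepath?foo=bar"
    for href in ("/about", "?foo=bar2", "page2.html"):
        print(urljoin(base, href))
    # -> http://example.org/about
    # -> http://example.org/somepath?foo=bar2
    # -> http://example.org/page2.html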