From 4303c3dba72eaa87dd92af622540fbd4f493441d Mon Sep 17 00:00:00 2001
From: erickson
Date: Sat, 29 Jan 2011 16:06:52 +0000
Subject: [PATCH] Web spider sample script

Simple web spider script that visits, reports, parses, collects links,
and continues until it has fetched a configured number of pages.  Each
page load is a constrictor Task so timing data can be collected.

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/constrictor/trunk@1215 6d9bc8c9-1ec2-4278-b937-99fde70a366f
---
 constrictor.properties    |  12 ++++
 constrictor/data.py       |   1 -
 constrictor/log.py        |   4 +-
 constrictor/properties.py |  25 +++++++-
 constrictor/task.py       |   4 +-
 samples/web_spider.py     | 153 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 192 insertions(+), 7 deletions(-)
 create mode 100755 samples/web_spider.py

diff --git a/constrictor.properties b/constrictor.properties
index d2ffa3d82..378c9a46a 100644
--- a/constrictor.properties
+++ b/constrictor.properties
@@ -33,6 +33,18 @@ constrictor.port=21800
 
 #logs to stdout and stderr. options are 0=none,1=error,2=info,3=debug
 constrictor.loglevel=2
+# ---- Settings for sample web spider plugin --------------
+
+# Initial URL.  Can be different per thread w/ comma-separated list
+constrictor.plugin.web_spider.start_url=http://example.org/somepath?foo=bar,http://example.org/somepath?foo=bar2,http://example.org/somepath?foo=bar3
+
+# Each spider thread will stop crawling after fetching this many pages
+constrictor.plugin.web_spider.max_pages=100
+
+# Only allow the spider to fetch pages with a certain base path
+constrictor.plugin.web_spider.limit_path=/somepath
+
+
 
 
 
diff --git a/constrictor/data.py b/constrictor/data.py
index 802dadb90..16be41d2f 100644
--- a/constrictor/data.py
+++ b/constrictor/data.py
@@ -118,7 +118,6 @@ class Data(object):
 
         for task in self.runtime_data:
             task_times += task['duration']
-            log.log_debug("Storing " + task['name'])
 
             if task['name'] not in task_counts:
                 task_counts[task['name']] = 0;
diff --git a/constrictor/log.py b/constrictor/log.py
index a5ca65a75..c35059593 100644
--- a/constrictor/log.py
+++ b/constrictor/log.py
@@ -27,7 +27,7 @@ def init_log():
 def log_error(msg=''):
     if loglevel < 1: return
     from script import ScriptThread
-    sys.stderr.write('Error[%d]: %s\n' % (ScriptThread.get_thread_id(), msg))
+    sys.stderr.write('Err [%d]: %s\n' % (ScriptThread.get_thread_id(), msg))
     sys.stderr.flush()
 
 def log_info(msg=''):
@@ -38,4 +38,4 @@ def log_info(msg=''):
 def log_debug(msg=''):
     if loglevel < 3: return
     from script import ScriptThread
-    print 'Debug[%d]: %s' % (ScriptThread.get_thread_id(), msg)
+    print 'Debg[%d]: %s' % (ScriptThread.get_thread_id(), msg)
diff --git a/constrictor/properties.py b/constrictor/properties.py
index c3ac41a10..a306e9ff3 100644
--- a/constrictor/properties.py
+++ b/constrictor/properties.py
@@ -9,9 +9,7 @@
 Edited by Bill Erickson
   - added property name sorting to the store() method
 """
 
-import sys,os
-import re
-import time
+import sys, os, re, time
 
 class IllegalArgumentException(Exception):
@@ -318,6 +316,27 @@ class Properties(object):
         except KeyError:
             if hasattr(self._props,name):
                 return getattr(self._props, name)
+
+    def get_thread_prop(self, prop, unique=False):
+        from constrictor.script import ScriptThread
+
+        data = self.get_property(prop)
+        data = data.split(',')
+
+        currentThread = ScriptThread.get_thread_id()
+        totalThreads = self.get_property('constrictor.numThreads')
+
+        if len(data) > currentThread:
+            return data[currentThread]
+
+        if unique:
+            raise Exception("Too many threads for unique data. "
+                "Thread index is %d, size of dataset is %d" % (
+                currentThread, len(data)))
+
+        # data sharing is OK
+        return data[currentThread % len(data)]
+
 
 if __name__=="__main__":
     p = Properties()
diff --git a/constrictor/task.py b/constrictor/task.py
index e7a3276e5..e858ade71 100644
--- a/constrictor/task.py
+++ b/constrictor/task.py
@@ -29,7 +29,9 @@ class Task(object):
     on the actual Task object.
     """
 
-    def __init__(self, name=''):
+    def __init__(self, name=None):
+        if name is None:
+            name = self.__class__.__name__
         self.name = name
         self.reset()
 
diff --git a/samples/web_spider.py b/samples/web_spider.py
new file mode 100755
index 000000000..4eb5ef264
--- /dev/null
+++ b/samples/web_spider.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python
+# --------------------------------------------------------------
+# Simple web spider sample.  Each page fetch runs as a
+# constrictor Task so per-page timing data can be collected.
+# --------------------------------------------------------------
+
+import random, time, sys, re
+import urllib2
+from HTMLParser import HTMLParser
+from constrictor.task import Task
+from constrictor.script import Script, ScriptManager
+from constrictor.properties import Properties
+from constrictor.log import *
+
+
+class PageFetchTask(Task):
+    def __init__(self, spider, name=None):
+        Task.__init__(self, name)
+        self.spider = spider
+
+    def run(self):
+        # fetch a single page
+        return self.spider.fetch_url()
+
+class WebSpiderScript(Script):
+
+    # Heavily modified version of the script found at
+    # http://www.halotis.com/2009/09/16/python-web-crawler-script/
+
+    class Spider(HTMLParser):
+
+        def __init__(self, url, max_visits, limit_path='', allowed_hosts=[]):
+
+            HTMLParser.__init__(self)
+            self.url = url
+            self.db = {self.url: 1}
+            self.url_list = [self.url]
+            self.max_visits = max_visits
+            self.allowed_hosts = allowed_hosts
+            proto, self.host, path = self.url_parts(url)
+            self.limit_path = limit_path
+
+            try:
+                foo = self.allowed_hosts.index(self.host)
+            except ValueError:
+                self.allowed_hosts.append(self.host)
+
+        def url_parts(self, url):
+            proto = ''
+            host = ''
+            path = ''
+
+            res = re.search('^(https?)://([^\/]+)(.*)', url)
+            try:
+                proto = res.group(1)
+                host = res.group(2)
+            except (AttributeError, IndexError):
+                raise Exception("Invalid URL: %s" % url)
+            try:
+                path = res.group(3)
+            except IndexError:
+                pass
+
+            return proto, host, path
+
+
+        def handle_starttag(self, tag, attrs):
+
+            if tag == 'a' and attrs:
+
+                link = attrs[0][1]
+
+                if link[:4] != "http":
+                    proto, host, path = self.url_parts(self.url)
+
+                    if link[:1] == '/': # full path
+                        path = link
+
+                    elif link[:1] == '?': # GET params only
+                        res = re.match('(.*)\?.*', path)
+                        path = "%s%s" % (res.group(1), link)
+
+                    else: # relative path
+                        parts = path.split('/')
+                        path = path.replace(parts[-1:][0], link)
+
+                    link = "%s://%s%s" % (proto, host, path)
+
+                if link not in self.db:
+                    self.url_list.append(link)
+
+                self.db[link] = (self.db.get(link) or 0) + 1
+
+        def fetch_url(self):
+            req = urllib2.urlopen(self.url)
+            return req.read()
+
+        def crawl(self):
+
+            visited = 0
+
+            for self.url in self.url_list:
+
+                if visited >= self.max_visits: break
+
+                log_debug("Visited %d URLs" % visited)
+
+                proto, host, path = self.url_parts(self.url)
+
+                try:
+                    self.allowed_hosts.index(host)
+                except ValueError:
+                    log_info("Skipping remote host %s..." % host)
+                    continue
+
+                if self.limit_path:
+                    if path[:len(self.limit_path)] != self.limit_path:
+                        log_info("Skipping forbidden base path %s..."
+                            % path)
+                        continue
+
+                try:
+                    log_info("Opening URL %s" % self.url)
+                    res = PageFetchTask(self).start()
+                    self.reset()
+                    self.feed(res)
+                    visited += 1
+                except:
+                    self.reset()
+
+            log_info("Found %d distinct URLs" % len(self.db.keys()))
+
+    def run(self):
+
+        props = Properties.get_properties()
+        start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
+        max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
+        limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+
+        if not start_url or not max_pages:
+            log_error("Missing required properties: " +
+                "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
+            return False
+
+        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider.crawl()
+
+        return True
+
+# Launch the script
+ScriptManager.go(WebSpiderScript())
+
+
-- 
2.11.0
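
A note on the comma-separated start_url setting above: get_thread_prop() hands each constrictor script thread its own entry and wraps around when there are more threads than entries. The sketch below mirrors that lookup outside of constrictor; pick_start_url and thread_id are hypothetical stand-ins for illustration (thread_id plays the role of ScriptThread.get_thread_id()).

    # Stand-alone sketch of the per-thread property lookup used by
    # get_thread_prop(); 'thread_id' stands in for ScriptThread.get_thread_id().
    def pick_start_url(prop_value, thread_id, unique=False):
        urls = prop_value.split(',')
        if thread_id < len(urls):
            return urls[thread_id]              # one URL per thread
        if unique:
            raise Exception("Too many threads for unique data. "
                "Thread index is %d, size of dataset is %d" % (thread_id, len(urls)))
        return urls[thread_id % len(urls)]      # share URLs when threads outnumber them

    value = ("http://example.org/somepath?foo=bar,"
             "http://example.org/somepath?foo=bar2,"
             "http://example.org/somepath?foo=bar3")
    for tid in range(5):                        # e.g. 5 threads, 3 URLs
        print(pick_start_url(value, tid))       # threads 3 and 4 wrap to URLs 0 and 1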
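Because each page load runs through PageFetchTask, constrictor's normal per-task timing applies to every fetch. The same pattern fits any other unit of work worth timing. Below is a minimal sketch assuming only what the patch itself shows (a Task subclass overriding run(), started with start(), with durations ending up in the runtime data that data.py aggregates); SleepTask and the 0.25-second figure are made up for illustration.

    import time
    from constrictor.task import Task

    class SleepTask(Task):
        # With the Task.__init__ change in this patch, the task name
        # defaults to the class name ("SleepTask") when none is given.
        def run(self):
            time.sleep(0.25)    # stand-in for the real work being timed
            return True

    # start() runs the task the same way PageFetchTask is started in
    # crawl(), so a duration is collected for each started task.
    SleepTask().start()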
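Finally, a side note on the hand-rolled link absolutizing in handle_starttag: the standard library's urljoin covers the same three cases (absolute path, query-only, relative path) and may be easier to maintain. A small sketch, in the Python 2 spelling to match the script (urllib.parse.urljoin on Python 3):

    from urlparse import urljoin    # urllib.parse.urljoin on Python 3

    base = "http://example.org/somepath?foo=bar"
    for href in ("/about", "?foo=bar2", "page2.html"):
        print(urljoin(base, href))
    # -> http://example.org/about
    # -> http://example.org/somepath?foo=bar2
    # -> http://example.org/page2.html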