From fa8acd21e1e069be6175d10f4dc893fa823ac403 Mon Sep 17 00:00:00 2001 From: erickson Date: Wed, 23 Feb 2011 16:31:06 +0000 Subject: [PATCH] improved url deconstruction; ignore JS and anchor links git-svn-id: svn://svn.open-ils.org/ILS-Contrib/constrictor/trunk@1236 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- samples/web_spider.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/samples/web_spider.py b/samples/web_spider.py index 4eb5ef264..0c82b4270 100755 --- a/samples/web_spider.py +++ b/samples/web_spider.py @@ -63,19 +63,25 @@ class WebSpiderScript(Script): return proto, host, path - def handle_starttag(self, tag, attrs): if tag == 'a' and attrs: - link = attrs[0][1] + + link = [h for h in attrs if h[0] == 'href'] + if len(link) == 0: return + link = link[0][1] if link[:4] != "http": proto, host, path = self.url_parts(self.url) + # Ignore href=javascript:foo and page anchors + if link[:11] == 'javascript:' or link[:1] == '#': + return + if link[:1] == '/': # full path - link = "%s://%s%s" % (proto, host, link) + path = link elif link[:1] == '?': # GET params only res = re.match('(.*)\?.*', path) -- 2.11.0