From: erickson Date: Wed, 23 Feb 2011 16:31:06 +0000 (+0000) Subject: improved url deconstruction; ignore JS and anchor links X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=fa8acd21e1e069be6175d10f4dc893fa823ac403;p=working%2Frandom.git improved url deconstruction; ignore JS and anchor links git-svn-id: svn://svn.open-ils.org/ILS-Contrib/constrictor/trunk@1236 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- diff --git a/samples/web_spider.py b/samples/web_spider.py index 4eb5ef264..0c82b4270 100755 --- a/samples/web_spider.py +++ b/samples/web_spider.py @@ -63,19 +63,25 @@ class WebSpiderScript(Script): return proto, host, path - def handle_starttag(self, tag, attrs): if tag == 'a' and attrs: - link = attrs[0][1] + + link = [h for h in attrs if h[0] == 'href'] + if len(link) == 0: return + link = link[0][1] if link[:4] != "http": proto, host, path = self.url_parts(self.url) + # Ignore href=javascript:foo and page anchors + if link[:11] == 'javascript:' or link[:1] == '#': + return + if link[:1] == '/': # full path - link = "%s://%s%s" % (proto, host, link) + path = link elif link[:1] == '?': # GET params only res = re.match('(.*)\?.*', path)