improved url deconstruction; ignore JS and anchor links
author erickson <erickson@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Wed, 23 Feb 2011 16:31:06 +0000 (16:31 +0000)
committer erickson <erickson@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Wed, 23 Feb 2011 16:31:06 +0000 (16:31 +0000)
git-svn-id: svn://svn.open-ils.org/ILS-Contrib/constrictor/trunk@1236 6d9bc8c9-1ec2-4278-b937-99fde70a366f

samples/web_spider.py

index 4eb5ef2..0c82b42 100755 (executable)
@@ -63,19 +63,25 @@ class WebSpiderScript(Script):
 
             return proto, host, path
 
-           
             
         def handle_starttag(self, tag, attrs): 
 
             if tag == 'a' and attrs: 
 
-                link = attrs[0][1] 
+
+                link = [h for h in attrs if h[0] == 'href']
+                if len(link) == 0: return
+                link = link[0][1] 
 
                 if link[:4] != "http": 
                     proto, host, path = self.url_parts(self.url)
 
+                    # Ignore href=javascript:foo and page anchors
+                    if link[:11] == 'javascript:' or link[:1] == '#':
+                        return
+
                     if link[:1] == '/': # full path
-                        link = "%s://%s%s" % (proto, host, link)
+                        path = link
 
                     elif link[:1] == '?': # GET params only
                         res = re.match('(.*)\?.*', path)