Improved URL deconstruction; ignore JavaScript and anchor links

author erickson <erickson@6d9bc8c9-1ec2-4278-b937-99fde70a366f>

Wed, 23 Feb 2011 16:31:06 +0000 (16:31 +0000)

committer erickson <erickson@6d9bc8c9-1ec2-4278-b937-99fde70a366f>

Wed, 23 Feb 2011 16:31:06 +0000 (16:31 +0000)
author erickson <erickson@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Wed, 23 Feb 2011 16:31:06 +0000 (16:31 +0000)
committer erickson <erickson@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Wed, 23 Feb 2011 16:31:06 +0000 (16:31 +0000)
diff --git a/samples/web_spider.py b/samples/web_spider.py

index 4eb5ef2..0c82b42 100755 (executable)
--- a/samples/web_spider.py
+++ b/samples/web_spider.py
@@ -63,19 +63,25 @@ class WebSpiderScript(Script):
  
              return proto, host, path
  
-           
              
          def handle_starttag(self, tag, attrs): 
  
              if tag == 'a' and attrs: 
  
-                link = attrs[0][1] 
+
+                link = [h for h in attrs if h[0] == 'href']
+                if len(link) == 0: return
+                link = link[0][1] 
  
                  if link[:4] != "http": 
                      proto, host, path = self.url_parts(self.url)
  
+                    # Ignore href=javascript:foo and page anchors
+                    if link[:11] == 'javascript:' or link[:1] == '#':
+                        return
+
                      if link[:1] == '/': # full path
-                        link = "%s://%s%s" % (proto, host, link)
+                        path = link
  
                      elif link[:1] == '?': # GET params only
                          res = re.match('(.*)\?.*', path)
author	erickson <erickson@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
	Wed, 23 Feb 2011 16:31:06 +0000 (16:31 +0000)
committer	erickson <erickson@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
	Wed, 23 Feb 2011 16:31:06 +0000 (16:31 +0000)