return proto, host, path
-
def handle_starttag(self, tag, attrs):
if tag == 'a' and attrs:
- link = attrs[0][1]
+
+ link = [h for h in attrs if h[0] == 'href']
+ if len(link) == 0: return
+ link = link[0][1]
if link[:4] != "http":
proto, host, path = self.url_parts(self.url)
+ # Ignore href=javascript:foo and page anchors
+ if link[:11] == 'javascript:' or link[:1] == '#':
+ return
+
if link[:1] == '/': # full path
- link = "%s://%s%s" % (proto, host, link)
+ path = link
elif link[:1] == '?': # GET params only
res = re.match('(.*)\?.*', path)