-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathanchor.py
83 lines (68 loc) · 2.43 KB
/
anchor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import urlparse
import htmlunit
from lazyproperty import lazyproperty
from link import Link, Links, AbstractLink
from ignore_urls import filterIgnoreUrlParts
from vectors import urlvector
from fakehtmlunitanchor import FakeHtmlUnitAnchor
class Anchor(Link):
    """Link backed by an anchor element.

    The wrapped ``internal`` object is either an HtmlUnit element or a
    ``FakeHtmlUnitAnchor`` stand-in; ``click`` dispatches on which one it is.
    """

    def __init__(self, internal, reqresp):
        """Strip unsupported attributes from ``internal`` and initialise the Link.

        Every attribute whose name starts with ``"on"``, as well as
        ``"target"``, is removed from ``internal`` before it is passed on
        to ``Link.__init__`` together with ``reqresp``.
        """
        # TODO: properly support these attributes instead of dropping them
        # Take a list snapshot of the attribute names first: attributes are
        # removed inside the loop, so the key set is not walked directly.
        attribute_names = list(internal.getAttributesMap().keySet())
        for name in attribute_names:
            if name == "target" or name.startswith("on"):
                internal.removeAttribute(name)
        super(Anchor, self).__init__(internal, reqresp)

    @lazyproperty
    def href(self):
        """The element's ``href`` attribute after ``filterIgnoreUrlParts``."""
        # self.internal is presumably stored by Link.__init__ -- TODO confirm
        return filterIgnoreUrlParts(self.internal.getAttribute("href"))

    @lazyproperty
    def hrefurl(self):
        """``href`` parsed with ``urlparse.urlparse``."""
        return urlparse.urlparse(self.href)

    def click(self):
        """Click the underlying element and return whatever its click returns.

        A ``FakeHtmlUnitAnchor`` is clicked directly; anything else is cast
        to ``htmlunit.HtmlElement`` and clicked with all four boolean
        arguments set to ``False``.
        """
        if not isinstance(self.internal, FakeHtmlUnitAnchor):
            # NOTE(review): meaning of the four flags depends on the HtmlUnit
            # click() overload in use -- confirm against the HtmlUnit API.
            html_element = htmlunit.HtmlElement.cast_(self.internal)
            return html_element.click(False, False, False, False)
        return self.internal.click()

    @lazyproperty
    def _str(self):
        """Printable form built from ``href`` and ``dompath``."""
        description = (self.href, self.dompath)
        return "Anchor(%s, %s)" % description

    @lazyproperty
    def linkvector(self):
        """``urlvector`` computed from the parsed href."""
        return urlvector(self.hrefurl)
class AbstractAnchor(AbstractLink):
    """Abstract link grouping several anchors, identified by their hrefs.

    ``hrefs`` holds the set of ``href`` values of the grouped anchors and
    ``href`` exposes a single representative value for the whole group.
    """

    def __init__(self, anchors):
        """Build the group from an iterable of objects exposing ``href``."""
        # Materialise the iterable: it is handed to AbstractLink.__init__ and
        # then iterated again below, which a one-shot iterator would not allow.
        if not isinstance(anchors, list):
            anchors = list(anchors)
        AbstractLink.__init__(self, anchors)
        self.hrefs = set(i.href for i in anchors)
        self.type = Links.Type.ANCHOR
        # Cache for the ``href`` property; None means "not computed yet".
        self._href = None

    def update(self, anchors):
        """Replace the tracked hrefs with those of ``anchors``.

        The cached ``href`` is dropped whenever the set of hrefs actually
        changes. Comparing only the set sizes would keep a stale value when
        hrefs are swapped for different ones of the same count.
        """
        new_hrefs = set(i.href for i in anchors)
        if new_hrefs != self.hrefs:
            self._href = None
        self.hrefs = new_hrefs
        # NOTE(review): ``hasquery`` is a lazyproperty; depending on how
        # lazyproperty caches, it may not be refreshed here -- confirm.

    @property
    def _str(self):
        """Printable form built from ``hrefs`` and ``targets``."""
        return "AbstractAnchor(%s, targets=%s)" % (self.hrefs, self.targets)

    def equals(self, a):
        """Return True when ``a`` tracks exactly the same set of hrefs."""
        return self.hrefs == a.hrefs

    @lazyproperty
    def hasquery(self):
        """True if at least one tracked href contains a ``?``."""
        return any('?' in i for i in self.hrefs)

    @property
    def href(self):
        """Representative href for the group.

        With a single tracked href that href is returned. Otherwise the
        result is the longest common prefix of all tracked hrefs, which is
        the empty string when they share no prefix or when no hrefs are
        tracked. The value is cached until ``update`` changes the hrefs.
        """
        # Test against None so that an empty prefix is cached as well.
        if self._href is None:
            if len(self.hrefs) == 1:
                self._href = next(iter(self.hrefs))
            else:
                # Count the leading positions at which every href agrees.
                # zip() stops at the shortest href, so when the loop finishes
                # without a break the whole shortest href is the prefix.
                prefix_len = 0
                for chars in zip(*self.hrefs):
                    if any(c != chars[0] for c in chars):
                        break
                    prefix_len += 1
                # The default keeps an empty href set from raising
                # StopIteration out of a property.
                sample = next(iter(self.hrefs), "")
                self._href = sample[:prefix_len]
        return self._href