-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathnaivehtmlparser.py
84 lines (65 loc) · 2.16 KB
/
naivehtmlparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
"""
Python 3.x HTMLParser extension with ElementTree support.
"""
from html.parser import HTMLParser
from xml.etree import ElementTree
class NaiveHTMLParser(HTMLParser):
    """
    Python 3.x HTMLParser extension with ElementTree support.

    Turns the start tags, end tags and text reported by
    ``html.parser.HTMLParser`` into ``xml.etree.ElementTree`` elements.
    The parser is deliberately naive: it does not implement the HTML
    tree-construction rules, it only tracks a stack of open elements.

    Attributes:
        root: the most recently created top-level element, or ``None``
            if no start tag has been seen yet.
        tree: the stack of currently open elements (innermost last).

    @see https://github.com/marmelo/python-htmlparser
    """

    # Elements that never have content or an end tag. They are attached to
    # the tree but never pushed on the open-element stack, otherwise every
    # following sibling would end up nested inside them.
    VOID_ELEMENTS = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    def __init__(self, *, convert_charrefs=True):
        """
        Create an empty parser.

        ``convert_charrefs`` is forwarded unchanged to ``HTMLParser``.
        """
        # Set before the base initialiser runs, as the base class calls
        # reset() from its own __init__.
        self.root = None
        self.tree = []
        super().__init__(convert_charrefs=convert_charrefs)

    def feed(self, data):
        """
        Feed a chunk of markup to the parser and return the current root.

        Returns ``self.root`` (``None`` when no start tag has been seen).
        Call ``close()`` after the last chunk so that any text still
        buffered by ``HTMLParser`` is delivered to the tree.
        """
        super().feed(data)
        return self.root

    def handle_starttag(self, tag, attrs):
        """Create an element for ``tag`` under the innermost open element."""
        attributes = dict(self.__filter_attrs(attrs))
        if self.tree:
            element = ElementTree.SubElement(self.tree[-1], tag, attributes)
        else:
            # Nothing is open: this element starts a new top-level tree.
            element = ElementTree.Element(tag, attributes)
            self.root = element
        if tag not in self.VOID_ELEMENTS:
            self.tree.append(element)

    def handle_endtag(self, tag):
        """
        Close the innermost open element named ``tag``.

        Elements opened after it that were never explicitly closed are
        closed implicitly. An end tag with no matching open element is
        ignored instead of corrupting (or underflowing) the stack.
        """
        for depth in range(len(self.tree) - 1, -1, -1):
            if self.tree[depth].tag == tag:
                del self.tree[depth:]
                return

    def handle_startendtag(self, tag, attrs):
        """Handle an XHTML-style self-closing tag such as ``<br/>``."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_data(self, data):
        """
        Attach text to the tree following ElementTree's text/tail model.

        Text that appears before the first child of the open element is
        appended to that element's ``text``; text that follows a child is
        appended to that child's ``tail``. Text outside any open element
        is dropped.
        """
        if not self.tree:
            return
        current = self.tree[-1]
        if len(current):
            # The open element already has children, so this text sits
            # after the last of them.
            last_child = current[-1]
            last_child.tail = (last_child.tail or "") + data
        else:
            current.text = (current.text or "") + data

    def get_root_element(self):
        """Return the root element built so far, or ``None``."""
        return self.root

    def __filter_attrs(self, attrs):
        """
        Return the ``(name, value)`` pairs that have a non-empty name and
        a non-empty value; valueless attributes (``value is None``) and
        empty-string values are discarded.
        """
        return [(name, value) for name, value in (attrs or ()) if name and value]
# Demonstration: parse a small document and query the resulting tree.
if __name__ == "__main__":
    sample_markup = """
<html>
<head>
<title>GitHub</title>
</head>
<body>
<a href="https://github.com/marmelo">GitHub</a>
<a href="https://github.com/marmelo/python-htmlparser">GitHub Project</a>
</body>
</html>
"""

    demo_parser = NaiveHTMLParser()
    document = demo_parser.feed(sample_markup)
    demo_parser.close()

    # `document` is an xml.etree.ElementTree.Element, so the regular
    # ElementTree API (including its limited XPath subset) is available.

    # Print the page title.
    title_element = document.find('head/title')
    print(title_element.text)

    # Print the target of every anchor in the document.
    anchor_targets = [anchor.get('href') for anchor in document.findall('.//a')]
    for target in anchor_targets:
        print(target)

    # Further reading:
    # https://docs.python.org/3/library/xml.etree.elementtree.html
    # https://docs.python.org/3/library/xml.etree.elementtree.html#xpath-support