# old.py
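# NOTE: the code below calls LinkParser() and json.dumps(), but as shown it
# never defines or imports them. The imports and the LinkParser class here
# are a minimal reconstruction, assumed from the calls that follow: feed(),
# self.links and self.baseUrl imply an HTMLParser subclass that collects
# the target of every <a href="..."> tag it sees.
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import json

class LinkParser(HTMLParser):
    # handle_starttag() is called by HTMLParser for every opening tag;
    # we collect <a href="..."> targets, resolved against the base URL
    # so relative links become absolute
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links = self.links + [newUrl]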
    # This is a new function that we are creating to get links
    # that our spider() function will call
    def getLinks(self, url):
        self.links = []
        # Remember the base URL, which handle_starttag() needs in order
        # to turn relative links into absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        htmlBytes = response.read()
        # Note that feed() handles strings, not bytes
        # (a change from Python 2.x to Python 3.x)
        htmlString = htmlBytes.decode("utf-8")
        self.feed(htmlString)
        return htmlString, self.links
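# For reference, a minimal usage sketch of getLinks() on its own, kept as
# comments so it does not run on import (the URL is one of the sites
# crawled below; any reachable HTML page would do):
#
#   parser = LinkParser()
#   html, links = parser.getLinks("https://www.portaldoholanda.com.br/amazonas")
#   print(len(links), "links found")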
# And finally here is our spider. It takes in a start URL and the number
# of pages to crawl before giving up, and records every page that mentions
# one of the keywords below
def spider(url, maxPages):
    frontier = [url]
    index = 0
    parser = LinkParser()
    pages = {'link': []}
    # Portuguese keywords: death, dead, dies, accident, robbery,
    # hospital, medicine
    keywords = [
        'morte', 'morto', 'morre', 'acidente', 'assalto', 'hospital', 'medicamento'
    ]
    while index < maxPages and index < len(frontier):
        # Grab the page we are about to crawl *before* mutating frontier:
        # list(set(...)) below reorders it, so frontier[index] is not
        # stable across iterations
        currentUrl = frontier[index]
        #print(index, 'Current page ->', currentUrl)
        try:
            data, links = parser.getLinks(currentUrl)
        except Exception:
            # Dead or non-HTML links would otherwise crash the whole
            # crawl; skipping them is an addition here, the original
            # had no error handling
            index = index + 1
            continue
        frontier = list(set(frontier + links))
        #print('new frontier size:', len(frontier))
        index = index + 1
        found_words = []
        for keyword in keywords:
            if data.find(keyword) > -1:
                #print(keyword, 'found!')
                found_words.append(keyword)
        if len(found_words) > 0:
            # Record the page we actually crawled: the original indexed
            # frontier[index] *after* the increment, which points at the
            # next page and can raise IndexError on the last iteration
            page = {'url': currentUrl, 'keywords': found_words}
            pages['link'].append(page)
            #print(found_words, currentUrl)
    #print()
    #print(frontier[:maxPages])
    print(json.dumps(pages, indent=4))
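# The printed JSON has this shape (illustrative values, not real output):
#
#   {
#       "link": [
#           {"url": "https://.../some-article", "keywords": ["acidente"]}
#       ]
#   }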
# execution: crawl the same site ten times in a row, as in the original
for i in range(10):
    spider("https://www.portaldoholanda.com.br/amazonas", 10)
    #spider("https://d24am.com/amazonas/", 10)
    #spider("https://www.acritica.com/channels/manaus", 10)