crawler.py
import requests
from LinkParser import LinkParser
from database.newssites import *
###########################################################
# database connection
dburl = "mongodb://localhost:27017/"
dbname = "newssites"
db = getdb(dburl, dbname)
################ FUNCTIONS #################
############ PARSERS ###############
# parse a URL, returning the HTML page and the list of hyperlinks it contains
def parserURL(baseUrl, url):
    # fetch the HTML, sending a browser-like User-Agent header
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    r = requests.get(url, headers=headers, timeout=10)  # timeout guards against hung requests
    htmlString = r.text
    # extract hyperlinks with the project's LinkParser
    parser = LinkParser()
    parser.baseUrl = baseUrl
    parser.feed(htmlString)
    return htmlString, parser.links
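# Example (a sketch, with a placeholder URL):
#   html, links = parserURL("https://example.com", "https://example.com/news")
# would return the page's HTML plus the absolute links LinkParser collected.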
# parse HTML, returning the list of keywords found in it
def parserHTML(keywords, html):
    foundkeywords = []
    # simple substring search for each keyword
    for keyword in keywords:
        if html.find(keyword) > -1:
            foundkeywords.append(keyword)
    return foundkeywords
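# Example (a sketch): parserHTML(["economy", "election"], "<p>economy news</p>")
# returns ["economy"]; matching is plain substring search, so partial words match too.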
############ SPIDER ###############
# walk the frontier, parsing one page per index
# (a loop rather than recursion, so long crawls do not hit Python's recursion limit)
def spider(baseUrl, index):
    while True:
        # current url from the frontier
        url = getfrontier(db, index)
        if url is None:
            return
        # parse one url and add the newly found links to the frontier
        html, links = parserURL(baseUrl, url)
        addfrontiers(db, links)
        # debug: check whether the frontier has grown
        # print("frontier len:", len(getfrontiers(db)))
        # parse the html and add it to the repository if any keyword was found
        foundkeywords = parserHTML(getkeywords(db), html)
        if len(foundkeywords) > 0:
            addrepository(db, baseUrl, url, foundkeywords)
        # next url
        index = index + 1
# debug: check whether the repository has grown
# repository = getrepository(db)
# print("Repository:", repository, len(repository))