-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscraping.py
executable file
·66 lines (57 loc) · 2.18 KB
/
webscraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from bs4 import BeautifulSoup
from requests import get
from contextlib import closing
import os
import re
import io
"""Funcion para realizar peticiones web"""
def get_req(url, timeout=30):
    """Download *url* and return its body parsed as HTML.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without it a stalled
            server would block the script indefinitely.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # closing() releases the connection even if parsing raises.
    with closing(get(url, stream=True, timeout=timeout)) as resp:
        return BeautifulSoup(resp.content, 'html.parser')
# Entry page of the crawl and the accumulators filled by the steps below.
mainUrl = "http://www.ijmlc.org/list-6-1.html"
urlsLists = []
urlsArticles = []
articleNames = []

# Request the main page and keep every link that points to an article-list
# page (http://www.ijmlc.org/list-<n>-<n>.html).
listUrlPattern = re.compile(
    r'^http://www\.ijmlc\.org/list-[0-9]+-[0-9]+\.html$'
)
mainHtml = get_req(mainUrl)
for link in mainHtml.find_all('a'):
    href = link.get('href')
    # An <a> tag without an href yields None, and re.match() raises
    # TypeError on None, so such anchors are skipped.
    if href is None:
        continue
    match = listUrlPattern.match(href)
    if match:
        urlsLists.append(match[0])
# Request the article-list pages (only the first 50) and keep every link
# that points to an individual article page.
# A raw string keeps the regex escapes from being read as (invalid) Python
# string escapes.
articleUrlPattern = re.compile(
    r'^http://www\.ijmlc\.org/index\.php\?m=content&c=index&a=show'
    r'&catid=[0-9]+&id=[0-9]+$'
)
for urlList in urlsLists[:50]:
    listHtml = get_req(urlList)
    for link in listHtml.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        match = articleUrlPattern.match(href)
        if match:
            urlsArticles.append(match[0])
# Request every article page and keep the relative paths of the PDFs it
# links to (vol<n>/<n>-<LETTERS><n>.pdf).
# A raw string keeps `\.` from being read as an invalid Python string escape.
pdfHrefPattern = re.compile(r'^vol[0-9]+/[0-9]+-[A-Z]+[0-9]+\.pdf$')
for urlArticle in urlsArticles:
    articleHtml = get_req(urlArticle)
    for link in articleHtml.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        match = pdfHrefPattern.match(href)
        if match:
            articleNames.append(match[0])
# Download the PDF files (only the first 250) into ./pdf.
# The target directory is created up front; open() does not create it and
# would otherwise fail with FileNotFoundError.
os.makedirs('pdf', exist_ok=True)
pdfNamePattern = re.compile(r'^vol[0-9]+/([0-9]+-[A-Z]+[0-9]+)\.pdf$')
for articleName in articleNames[:250]:
    nameArchive = pdfNamePattern.search(articleName)
    if nameArchive is None:
        # Not a path of the expected shape; there is no file name to use.
        continue
    with closing(get('http://www.ijmlc.org/' + articleName,
                     stream=True, timeout=30)) as resp:
        if not resp.ok:
            # Do not store an HTTP error page under a .pdf name.
            continue
        with open('pdf/' + nameArchive.group(1) + '.pdf', 'wb') as pdfFile:
            # Stream the body in 1 KiB chunks instead of loading it whole.
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    pdfFile.write(chunk)
# Extract the text of every downloaded PDF into ./corpus by running the
# pdf2txt.py command-line tool once per file.
os.makedirs('corpus', exist_ok=True)
os.chdir('./corpus')
for pdf in os.listdir('../pdf'):
    # fullmatch() accepts only names made of digits, '-', capital letters
    # and the '.pdf' suffix, so the name is safe to splice into the shell
    # command below. Any other directory entry is skipped instead of
    # crashing on a failed match.
    nameArchive = re.fullmatch(r'([0-9]+-[A-Z]+[0-9]+)\.pdf', pdf)
    if nameArchive is None:
        continue
    bash = "pdf2txt.py -o " + nameArchive.group(1) + ".txt " + "../pdf/" + pdf
    os.system(bash)