# getpy.py (forked from gstaxy/EpubToPdf)
import ntpath
import os
import re

from bs4 import BeautifulSoup as bs


class GetEngine(object):
"""
This class contains the methods needed to get the files,
to help make the pdf file.
The class contains the following methods:
get_html() --- Which gets the html file names.
get_pdf() --- Which gets the pdf file names.
get_css() --- Which gets the css file names.
get_images() --- Which gets the image file names.
To create an instance of this object, pass in the name of the directory
that stores all the extracted files from the epub file.
"""
    def __init__(self, directory):
        # self.html_files = []
        # self.css_files = []
        # self.image_files = []
        self.directory = directory
        # self.files = []
        self.pdf_files = []
        self.pages = []

"""
def get_html(self):
for file in self.files:
if file.endswith(".xhtml") or file.endswith(".html"):
self.html_files.append(file)
"""
    def get_pdf(self):
        # for file in self.html_files:
        #     self.pdf_files.append("{}.pdf".format(self.html_files.index(file)))
        # One output name per page document; using the loop index directly
        # avoids the quadratic (and duplicate-unsafe) list.index() lookup.
        for index in range(len(self.pages)):
            self.pdf_files.append("{}.pdf".format(index))

"""
def get_css(self):
for file in self.files:
if file.endswith(".css"):
self.css_files.append(file)
"""
"""
def get_images(self):
for file in self.files:
if file.endswith((".png", ".jpg", ".gif")):
self.image_files.append(file)
"""
"""
def get_all(self):
file = None
directory_paths = []
for root, dirs, files in os.walk(self.directory):
#This traverses the directory passed in as an argument,
#returns the current directory, the sub directories and all the files
directory_paths.append(root)
if file:
continue
for each in files:
if each.endswith(".opf"):
file = os.path.join(root, each)
continue
if not file:
return
xml_content = open(file, "r").read()
xml_tree = bs(xml_content, features = "xml")
file_names = xml_tree.package.manifest.findAll('item')
# Gets the name of all the documents in order
# from the opf file, then saves the file name with its path
# The file path in the opf file can't be relied upon
# Hence, the need to extract file name and get its path
for file in file_names:
file_path_match = re.match(r'.+\.[a-zA-Z]+', file.get('href', ''))
if not file_path_match:
continue
file_name = ntpath.basename(file_path_match.group())
for path in directory_paths:
filepath = path + '/' + file_name
if os.path.exists(filepath):
self.files.append(filepath)
"""
    def get_pages(self):
        file = None
        directory_paths = []
        for root, dirs, files in os.walk(self.directory):
            # os.walk traverses the directory passed in at construction and
            # yields the current directory, its sub-directories and its files.
            directory_paths.append(root)
            if file:
                continue
            for each in files:
                if each.endswith(".ncx"):
                    file = os.path.join(root, each)
                    break  # take the first .ncx file found
        if not file:
            return
        with open(file, "r") as ncx_file:
            xml_content = ncx_file.read()
        xml_tree = bs(xml_content, features="xml")
        page_list = xml_tree.find('pageList')
        if page_list is None:
            # Not every epub includes a page-list in its ncx file.
            return
        page_targets = page_list.find_all('pageTarget')
        # Gets the names of all the page documents, in order, from the ncx
        # file, then saves each file name with its path.
        # The file paths recorded in the ncx file can't be relied upon;
        # hence the need to extract each file name and look up its real path.
        for page_target in page_targets:
            page_src = page_target.find('content')['src']
            file_path_match = re.match(r'.+\.[a-zA-Z]+', page_src)
            if not file_path_match:
                continue
            file_name = ntpath.basename(file_path_match.group())
            for path in directory_paths:
                filepath = os.path.join(path, file_name)
                if os.path.exists(filepath):
                    self.pages.append(filepath)
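
# A minimal usage sketch (an illustration, not part of the original module).
# It assumes the epub archive has already been unzipped into a directory
# named "extracted_epub"; that directory name is a placeholder. GetEngine
# only collects file paths here; the actual page-to-pdf conversion is
# handled elsewhere in the project.
if __name__ == "__main__":
    engine = GetEngine("extracted_epub")
    engine.get_pages()   # page document paths, in reading order
    engine.get_pdf()     # matching output names: "0.pdf", "1.pdf", ...
    for page, pdf in zip(engine.pages, engine.pdf_files):
        print(page, "->", pdf)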