scraperController.js
const fs = require('fs');
const https = require('https');
const rootScraper = require('./rootScraper.js');
const albumScraper = require('./albumScraper.js');
const photoScraper = require('./photoScraper.js');
const findRoot = require('./findRoot.js');
// If cacheFile exists and can be read as JSON, return it. If not, call the
// generate function, and write the results into the cache file, then return
// them.
//
// If cacheFlag is false, refresh the cacheFile.
async function cacheOr(cacheFile, cacheFlag, generate) {
  let logger = console.log;
  logger = () => {}; // Comment this out to see logging.
  if (cacheFlag) {
    try {
      fs.accessSync(cacheFile, fs.constants.F_OK);
      const data = fs.readFileSync(cacheFile);
      const ret = JSON.parse(data);
      // TODO: Consider a verifier here.
      return ret;
    } catch (err) {
      // Only regenerate on ENOENT, so that other kinds of failures don't
      // lead to sudden fetch storms.
      if (err.code != 'ENOENT') {
        throw(err);
      }
    }
  }
  const ret = await generate();
  logger("ret:", ret);
  const tmpFile = cacheFile + ".tmp";
  fs.writeFileSync(tmpFile, JSON.stringify(ret, null, 2));
  fs.renameSync(tmpFile, cacheFile);
  return ret;
}
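
// Example (sketch): the call sites below wrap the generator in an immediately
// invoked function so it closes over page and the URL. A minimal standalone
// call would look like this; the path and payload are illustrative only, not
// part of this project:
//
//   const info = await cacheOr("out/Info.json", true, async () => {
//     return { fetchedAt: Date.now() };
//   });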

async function scrapeAll(browserInstance, argv) {
  let logger = console.log;
  logger = () => {}; // Comment this out to see logging.

  if (!argv.dir) {
    console.log("--dir=<dir> is required.");
    process.exit(1);
  }
  const dirname = argv.dir;
  try {
    fs.accessSync(dirname, fs.constants.F_OK);
  } catch (err) {
    if (err.code == 'ENOENT') {
      fs.mkdirSync(dirname);
    } else {
      console.log(err);
      process.exit(1);
    }
  }

  const rootUrl = argv.url;
  if (!rootUrl) {
    console.log("--url=<url> root URL required.");
    process.exit(1);
  }

  let browser;
  try {
    browser = await browserInstance;
    const pages = await browser.pages();
    let page = pages[0];

    // NOTE: I found that sometimes the scraper got wedged, and as best I
    // could tell it was because of various tracking stuff. This was an
    // attempt to weed out unnecessary cruft. It seems to have worked for
    // me, but it's really not clear if it's the right approach.
    await page.setRequestInterception(true);
    page.on('request', intercepted_request => {
      const url = intercepted_request.url();
      // None of these seem to be necessary.
      if (url.startsWith('https://sb.scorecardresearch.com/') ||
          url.startsWith('https://www.facebook.com/') ||
          url.startsWith('https://connect.facebook.net/') ||
          url.startsWith('https://www.mczbf.com/') ||
          url.startsWith('https://udc-neb.kampyle.com/') ||
          url.startsWith('https://assets.adobedtm.com/') ||
          url.startsWith('https://www.googletagservices.com/') ||
          url.startsWith('https://securepubads.g.doubleclick.net/') ||
          false) {
        intercepted_request.abort();
        return;
      }
      // These seem to be necessary.
      if (url.startsWith('data:') ||
          url.startsWith('https://accounts.tinyprints.com/') ||
          url.startsWith('https://accounts.shutterfly.com/') ||
          url.startsWith('https://www.shutterfly.com/') ||
          url.startsWith('https://beacon.shutterfly.com/') ||
          url.startsWith('https://iam.shutterfly.com/') ||
          url.startsWith('https://cmd.shutterfly.com/') ||
          url.startsWith('https://os.shutterfly.com/') ||
          url.startsWith('https://uniim-cp.shutterfly.com/') ||
          url.startsWith('https://uniim-share.shutterfly.com/') ||
          url.startsWith('https://cdn.staticsfly.com/') ||
          url.startsWith('https://cld1.staticsfly.com/') ||
          url.startsWith('https://cdn-stage.staticsfly.com/') ||
          url.startsWith('https://cdn.optimizely.com/') ||
          url.startsWith('https://fast.fonts.net/') ||
          url.startsWith('https://ajax.googleapis.com/') ||
          false) {
        intercepted_request.continue();
        return;
      }
      // TODO: Root share site URL is also necessary.
      // Default allow, since there could be interstitial ads on login.
      // (Switch this to abort() to default-deny instead.)
      //console.log(intercepted_request.url());
      intercepted_request.continue();
    });

    // For reasons I'm not clear about, sometimes the browser just hangs
    // waiting for the page to load. To "fix" it, use --login, then bring
    // up the dev inspector, click Network, refresh the page, and
    // double-click the request which hangs to open it in a new tab. It
    // doesn't fix things for the current run, but due to the userDataDir
    // setting in browser.js, it will work for an hour or two.
    if (argv.login) {
      console.log("Log in to the site and then close the browser.");
      await page.goto(rootUrl, {timeout: 0});
      while (await page.waitForNavigation({timeout: 0})) {
        console.log("Still waiting.");
      }
      process.exit(1);
    }

    // TODO: --album implies --root, and --photo implies --album. Should
    // --nocache apply to only the most specific part, or to the entire
    // stack?
    const noCache = argv.nocache;

    const rootInfoFile = dirname + "/Info.json";
    const rootInfo = await cacheOr(rootInfoFile, !noCache, (
      function (page, rootUrl) {
        return async () => {
          if (!rootUrl) {
            console.log("--url=<url> root URL required.");
            process.exit(1);
          }
          console.log("root:", rootUrl);
          await page.goto(rootUrl, {timeout: 0});
          const albumsUrl = await findRoot(page, rootUrl);
          await page.goto(albumsUrl, {timeout: 0});
          return await rootScraper(page, albumsUrl);
        };
      }
    )(page, rootUrl));

    // Short-circuit if root-only was requested.
    if (argv.root) {
      await browser.close();
      return;
    }
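
    // Example selection flags (values are illustrative only):
    //   --album=12345,67890
    //   --photo=12345:111,222,333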
    let albumIds = {};
    let photoIds = {};
    if (argv.album) {
      argv.album.split(",").forEach((id) => {
        albumIds[id] = 1;
      });
    } else if (argv.photo) {
      if (argv.photo.indexOf(":") == -1) {
        console.log("--photo=albumId:photoId,photoId,photoId,...");
        process.exit(1);
      }
      const a = argv.photo.split(":");
      console.log(a);
      albumIds[a[0]] = 1;
      a[1].split(",").forEach((id) => {
        photoIds[id] = 1;
      });
    }

    for (const album of rootInfo.albums) {
      if ((argv.album || argv.photo) && !albumIds[album.id]) {
        continue;
      }
      logger("album:", album.id);
      const albumDir = dirname + "/" + album.subdir;
      try {
        fs.accessSync(albumDir, fs.constants.F_OK);
      } catch (err) {
        fs.mkdirSync(albumDir);
      }
      const albumInfoFile = albumDir + "/Info.json";
      logger("albumInfoFile:", albumInfoFile);
      const albumInfo = await cacheOr(albumInfoFile, !noCache, (
        function (page, url, id) {
          return async () => {
            console.log("album:", url);
            await page.goto(url, {timeout: 0});
            return await albumScraper(page, url, id);
          };
        }
      )(page, album.url, album.id));

      for (const photo of albumInfo.photos) {
        if (argv.nophotos || (argv.photo && !photoIds[photo.id])) {
          continue;
        }
        logger("photo:", photo.id);
        const photoInfoFile = albumDir + "/" + photo.id + ".json";
        logger("photoInfoFile:", photoInfoFile);
        const photoInfo = await cacheOr(photoInfoFile, !noCache, (
          function (page, url, id) {
            return async () => {
              console.log("photo:", url);
              await page.goto(url, {timeout: 0});
              const info = await photoScraper.scrape(page, url, id);
              const imgPath = albumDir + "/" + info.fname;
              logger("photoImageFile:", imgPath);
              const imgUrl = await photoScraper.downloadUrl(page);
              logger("photoImgUrl:", imgUrl);
              // Attempt to fetch the image. The download is not awaited; the
              // .tmp file is renamed into place once the stream finishes.
              const tmpPath = imgPath + ".tmp";
              https.get(imgUrl, res => {
                const stream = fs.createWriteStream(tmpPath);
                res.pipe(stream);
                stream.on('finish', () => {
                  stream.close();
                  fs.renameSync(tmpPath, imgPath);
                });
              });
              // 2s pause for rate limiting.
              await page.waitForTimeout(2000);
              return info;
            };
          }
        )(page, photo.url, photo.id));
      }
    }
    await browser.close();
  } catch (err) {
    // TODO: This is copy/paste from a project which was copy/paste from an
    // example. Maybe it would make more sense to just not have the try
    // block?
    console.log("error:", err);
    throw(err);
  }
}

module.exports = (browserInstance, argv) => scrapeAll(browserInstance, argv);
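
// Example wiring (sketch, assumptions only): this file does not dictate how
// browserInstance or argv are produced. A typical caller might look like the
// following, assuming browser.js exports a puppeteer launch promise and a CLI
// parser such as minimist supplies argv; both of those are hypothetical here.
//
//   const browserInstance = require('./browser.js');
//   const scrapeAll = require('./scraperController.js');
//   const argv = require('minimist')(process.argv.slice(2));
//   scrapeAll(browserInstance, argv);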