-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrootScraper.js
119 lines (96 loc) · 4.11 KB
/
rootScraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/**
 * Build the subdirectory name for an album: "YYYY-MM-DD - <title>".
 *
 * The date portion is derived from the first "<Month> <day>, <year>" match in
 * `date` (for example "January 5, 2020" becomes "2020-01-05"). Every
 * occurrence of the characters / < > : " \ | ? * in `title` is replaced by an
 * underscore.
 *
 * @param {string} date - Text containing "<Month> <day>, <year>", where
 *     <Month> is a full English month name.
 * @param {string} title - Album title.
 * @returns {string} The subdirectory name.
 * @throws {Error} If `date` contains no such date, or the month name is not
 *     one of the twelve known names.
 */
function albumSubdir(date, title) {
  let logger = console.log;
  logger = () => {}; // Comment this out to see logging.

  const months = {
    January: "01",
    February: "02",
    March: "03",
    April: "04",
    May: "05",
    June: "06",
    July: "07",
    August: "08",
    September: "09",
    October: "10",
    November: "11",
    December: "12",
  };

  logger("title: ", title);
  logger("date: ", date);

  const dateRe = /(\S+) (\d+), (\d+)/;
  const out = dateRe.exec(date);
  logger("out: ", out);

  // Fail with a message that names the offending input instead of an opaque
  // "cannot read properties of null".
  if (out === null) {
    throw new Error(`albumSubdir: cannot parse date "${date}"`);
  }

  const [, monthName, day, year] = out;

  // Own-property check so that inherited keys such as "constructor" are not
  // mistaken for month names, and unknown names never yield "undefined".
  if (!Object.prototype.hasOwnProperty.call(months, monthName)) {
    throw new Error(
      `albumSubdir: unknown month "${monthName}" in date "${date}"`
    );
  }

  // Declared locally; the previous undeclared assignment leaked a global and
  // threw a ReferenceError in strict mode / ES modules.
  const newDate = `${year}-${months[monthName]}-${day.padStart(2, "0")}`;
  return `${newDate} - ${title.replace(/[<>:"/\\|?*]/g, "_")}`;
}
// Scrape the album listing shown on |page|. If the listing is paged, switch
// it to "All" first, then collect the page title, the total count, and the
// details of every album.
//
// |rootUrl| is not navigated to here; it is only recorded in the result.
//
// Resolves to {url, title, count, albums}; each entry of |albums| has
// {id, title, url, count, date, caption, subdir}.
async function scrapeRoot(page, rootUrl) {
  const DEBUG = false; // Flip to true to see logging.
  const logger = DEBUG ? console.log : () => {};

  // Trimmed innerText of the first match of |selector| under |handle|.
  const trimmedText = (handle, selector) =>
    handle.$eval(selector, (node) => node.innerText.trim());

  // TODO: The page id is also the last component of the URL path. For
  // instance, '5'.
  const htmlId = await page.$eval('html', (root) => root.getAttribute('id'));
  const [id] = htmlId.split("-");
  logger("id:", id);

  // |totalSel| matches once every image is present; |prevSel| matches when a
  // paged interface is shown. In the paged case, ask for "All" and wait for
  // the listing to update.
  const totalSel = '.navbar-paging>.all';
  const prevSel = '.navbar-paging>.navbar-prev';
  const eitherSel = `${totalSel},${prevSel}`;
  logger(`Waiting for: ${eitherSel}`);
  await page.waitForSelector(eitherSel);

  logger(`Checking: ${prevSel}`);
  const pager = await page.$(prevSel);
  if (pager) {
    logger("Clicking All");
    // Equivalent of clicking the "All" button.
    await page.evaluate(`Shr.AjaxDataGrid._16("All", ${id})`);
    logger(`Waiting for: ${totalSel}`);
    await page.waitForSelector(totalSel);
  }

  const title = await trimmedText(page, '#header-title');
  logger("title:", title);
  const count = await trimmedText(page, totalSel);
  logger("count:", count);

  // The album title element supplies both the title text and the link.
  const titleLinkSel = '.picAlbumTitle .pic-album-title';
  const albumHandles = await page.$$('.pic-album');
  const albums = [];
  for (const handle of albumHandles) {
    const albumId = await handle.$eval(
      '.picAlbumTitle .i-edit',
      (node) => node.getAttribute('s:menuargs')
    );
    logger("id: ", albumId);

    const albumTitle = await trimmedText(handle, titleLinkSel);
    logger("title: ", albumTitle);

    const albumUrl = await handle.$eval(
      titleLinkSel,
      (node) => node.getAttribute('href')
    );
    logger("url: ", albumUrl);

    const albumCount = await trimmedText(handle, '.picAlbumTitle .i-eye');
    logger("count: ", albumCount);

    const albumDate = await trimmedText(handle, '.pic-date');
    logger("date: ", albumDate);

    const albumCaption = await trimmedText(handle, '.pic-album-text');
    logger("caption: ", albumCaption);

    albums.push({
      id: albumId,
      title: albumTitle,
      url: albumUrl,
      count: albumCount,
      date: albumDate,
      caption: albumCaption,
      subdir: albumSubdir(albumDate, albumTitle),
    });
  }

  // |url| maps the result back to the URL this was fetched from.
  return { url: rootUrl, title, count, albums };
}
// Module entry point: scrape the album listing shown on |page|. The second
// argument is forwarded to scrapeRoot as the root URL of that listing.
module.exports = (page, rootUrl) => scrapeRoot(page, rootUrl);
// TODO: Would it make sense to just scrape directly out of internal structures?
// Shr.P.sections[0].count == album count.
// Shr.P.sections[0].groups == albums.
// Shr.P.sections[0].groups[0].created == album 0 unix epoch
// Shr.P.sections[0].groups[0].count == album 0 picture count
// Shr.P.sections[0].groups[0].title == album 0 title
// Shr.P.sections[0].groups[0].text == album 0 caption
// Shr.P.sections[0].groups[0].coverPicture{} == album 0 cover pic info