-
Notifications
You must be signed in to change notification settings - Fork 0
/
collect.ts
165 lines (140 loc) · 4.53 KB
/
collect.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import { chromium } from 'playwright'
import { ProgressCli } from '@beenotung/tslib/progress-cli'
import { createHash } from 'crypto'
import { join } from 'path'
import { mkdir, stat, writeFile } from 'fs/promises'
import { find, seedRow } from 'better-sqlite3-proxy'
import { proxy } from './proxy'
import { config } from './config'
// Module-wide progress reporter; collectByKeyword() updates it with the
// current search status and moves to the next line when a keyword is done.
let cli = new ProgressCli()
/**
 * Launch promise of the single shared page.
 * `null` until the first `getPage()` call and again after `closeBrowser()`.
 */
let sharedPage: ReturnType<typeof launchPage> | null = null

/** Launches a non-headless Chromium instance and opens one page in it. */
async function launchPage() {
  let browser = await chromium.launch({ headless: false })
  return browser.newPage()
}

/**
 * Returns the shared page, launching the browser on first use.
 *
 * The launch *promise* is cached (not the resolved page), so callers that
 * arrive while the browser is still starting share the same launch instead of
 * each starting their own browser.
 */
let getPage = () => {
  if (!sharedPage) {
    let pending = launchPage()
    sharedPage = pending
    // A failed launch must not be cached, otherwise every later call would
    // receive the same rejection and never retry.
    pending.catch(() => {
      if (sharedPage === pending) sharedPage = null
    })
  }
  return sharedPage
}

/**
 * Closes the shared page and its browser.
 *
 * Does nothing when no browser was ever launched. After it resolves, the next
 * `getPage()` call launches a fresh browser.
 */
export async function closeBrowser() {
  let pending = sharedPage
  if (!pending) return
  // Forget the page first so concurrent getPage() callers do not receive a
  // page that is in the middle of being closed.
  sharedPage = null
  // If the launch itself failed there is nothing to close; that error was
  // already delivered to whoever called getPage().
  let page = await pending.catch(() => null)
  if (!page) return
  let browser = page.context().browser()
  try {
    await page.close()
  } finally {
    // Close the browser even if closing the page threw.
    await browser?.close()
  }
}
/**
 * Searches Google Images for `keyword` with the shared browser page and saves
 * the result thumbnails.
 *
 * Files are written to `join(config.rootDir, keyword)` and named by the
 * SHA-256 of their content. Each new file is recorded in `proxy.image`
 * together with its source page, domain and keyword.
 *
 * The function keeps scrolling and re-collecting until the number of results
 * stops changing for more than `maxStalledAttempts` consecutive rounds. This
 * includes the case where the page never shows any result at all.
 *
 * @param keyword search term; also used as the sub-directory name
 * @param options.cli_prefix text put in front of every progress message
 */
export async function collectByKeyword(
  keyword: string,
  options: { cli_prefix?: string } = {},
) {
  let cli_prefix = options.cli_prefix || ''
  cli.update(`${cli_prefix}searching "${keyword}"...`)

  let dir = join(config.rootDir, keyword)
  await mkdir(dir, { recursive: true })
  let keyword_id = seedRow(proxy.keyword, { keyword })

  // Submit the keyword through the image search form and wait for the result page.
  let page = await getPage()
  await page.goto('https://images.google.com', {
    waitUntil: 'domcontentloaded',
  })
  await page.fill('form textarea[name="q"]', keyword)
  await page.click('form button[type="submit"]')
  await page.waitForURL(/^https:\/\/www\.google\.com\/search/, {
    waitUntil: 'domcontentloaded',
  })

  type ImageItem = Awaited<ReturnType<typeof collectImages>>[number]

  /**
   * Runs inside the page: reads every `[data-lpage]` element and returns its
   * source page url plus the `src` and `alt` of the `<img>` inside it.
   */
  async function collectImages() {
    let images = await page.evaluate(async () => {
      // Upper bound for the per-image wait below (20 x 500ms = 10s), so one
      // thumbnail that never loads cannot hang the whole collection.
      let maxWaitRounds = 20
      let items = document.querySelectorAll<HTMLElement>('[data-lpage]')
      let images: { page_url: string; image_src: string; alt: string }[] = []
      for (let item of items) {
        let page_url = item.dataset.lpage
        let img = item.querySelector('img')
        // Skip malformed result items instead of throwing inside the page.
        if (!page_url || !img) continue
        // `img.src` value that is treated as "not loaded yet".
        // NOTE(review): empty here; presumably meant to be the placeholder
        // image the page shows while loading - confirm.
        let loading_gif =
          ''
        for (
          let round = 0;
          img.src == loading_gif && round < maxWaitRounds;
          round++
        ) {
          // Bring the image into view so the page loads it, then poll again.
          img.scrollIntoView({
            behavior: 'smooth',
            block: 'center',
          })
          await new Promise(resolve => setTimeout(resolve, 500))
        }
        // Still not loaded after the bounded wait: leave it out.
        if (img.src == loading_gif) continue
        images.push({ page_url, image_src: img.src, alt: img.alt })
      }
      return images
    })
    return images
  }

  /**
   * Runs inside the page: repeatedly scrolls the last result image into view
   * until there is no `[role="progressbar"]` element or the last one has zero
   * area (presumably the "loading more results" indicator - confirm).
   */
  async function scrollToBottom() {
    await page.evaluate(async () => {
      for (;;) {
        let imgs = document.querySelectorAll<HTMLElement>(
          '[data-lpage] g-img img',
        )
        let img = imgs[imgs.length - 1]
        if (!img) return
        img.scrollIntoView({ behavior: 'smooth', block: 'center' })
        await new Promise(resolve => setTimeout(resolve, 500))
        let bars = document.querySelectorAll('[role="progressbar"]')
        let bar = bars[bars.length - 1]
        if (!bar) return
        let rect = bar.getBoundingClientRect()
        let size = rect.width * rect.height
        if (size == 0) return
        await new Promise(resolve => setTimeout(resolve, 500))
      }
    })
  }

  /**
   * Downloads one image, stores it under its content hash and records it in
   * the database. Non-image or failed responses and already recorded files
   * are skipped.
   */
  async function saveImage(image: ImageItem) {
    let { page_url, image_src, alt } = image
    let res = await fetch(image_src)
    let mimeType = res.headers.get('Content-Type')
    if (!res.ok || !mimeType?.startsWith('image/')) {
      // Release the unread body so the underlying connection is not held open.
      await res.body?.cancel()
      return
    }
    // "image/png; charset=..." -> "png"
    let ext = mimeType.split('/')[1].split(';')[0]
    let buffer = Buffer.from(await res.arrayBuffer())
    let filename = createHash('sha256').update(buffer).digest('hex') + '.' + ext
    let file = join(dir, filename)
    let row = find(proxy.image, { filename })
    if (row) {
      return
    }
    // Only rewrite the file when it is missing or its size does not match.
    let fileSize = await getFileSize(file)
    if (fileSize != buffer.length) {
      await writeFile(file, buffer)
    }
    let domain = new URL(page_url).hostname
    let domain_id = seedRow(proxy.domain, { domain })
    let page_id = seedRow(proxy.page, { url: page_url }, { domain_id })
    proxy.image.push({ filename, page_id, keyword_id, alt, embedding: null })
  }

  // Number of consecutive rounds without a change in result count that are
  // tolerated before the search is considered finished.
  let maxStalledAttempts = 3
  let lastCount = 0
  let attempt = 0
  for (;;) {
    let images = await collectImages()
    let count = images.length
    if (count != lastCount) {
      attempt = 0
    } else {
      // Count stalled rounds even when nothing was found yet; otherwise a
      // page without any result would keep this loop spinning forever.
      attempt++
      if (attempt > maxStalledAttempts) {
        break
      }
      if (count == 0) {
        // scrollToBottom() returns immediately without images, so pause here
        // to give the first results some time to appear.
        await new Promise(resolve => setTimeout(resolve, 1000))
      }
    }
    cli.update(`${cli_prefix}searching "${keyword}": ${count} images ...`)
    // Only the items appended since the previous round are new.
    for (let image of images.slice(lastCount)) {
      await saveImage(image)
    }
    await scrollToBottom()
    lastCount = count
  }
  cli.update(`${cli_prefix}searched "${keyword}": ${lastCount} images.`)
  cli.nextLine()
}
/**
 * Resolves to the size in bytes of `file`, or to 0 when `stat` fails for any
 * reason (for example because the file does not exist).
 */
async function getFileSize(file: string) {
  return stat(file).then(
    info => info.size,
    () => 0,
  )
}