-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_yt_crawl.py
executable file
·44 lines (34 loc) · 1.27 KB
/
test_yt_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python3
import asyncio
import logging
import os
from crawl_manager import CrawlManager
from saver import RedisItemSaver, FileItemSaver, TarItemSaver
from scraper import IDScraper, NullScraper
from tracker import InMemoryTracker, RedisTracker
async def main(cm):
    """Seed the crawl manager's tracker from the fixture files, then crawl.

    Each line of ``test_data/youtube/known.txt`` and
    ``test_data/youtube/explored.txt`` is stripped of surrounding whitespace
    and handed, as two lazy iterables, to
    ``cm.initialize_tracker_with_items``. ``cm.crawl()`` is awaited afterwards.

    Args:
        cm: Object exposing awaitable
            ``initialize_tracker_with_items(known, explored)`` and ``crawl()``
            methods (the ``CrawlManager`` built by this script).

    Raises:
        OSError: If either fixture file cannot be opened.
    """
    # Open both fixtures under one ``with`` so the handles are closed
    # deterministically (also on error) instead of being left to the
    # garbage collector.
    with open('test_data/youtube/known.txt', 'r') as known_file, \
            open('test_data/youtube/explored.txt', 'r') as explored_file:
        await cm.initialize_tracker_with_items(
            (line.strip() for line in known_file),
            (line.strip() for line in explored_file),
        )
        # The crawl stays inside the block in case the tracker keeps the
        # lazy iterables and only reads them once crawling starts.
        await cm.crawl()
if __name__ == "__main__":
    # Script entry point: wire up one tracker, one scraper and one saver,
    # hand them to a CrawlManager and drive ``main`` to completion.
    logging.basicConfig(level=logging.DEBUG)

    # Tracker selection; the Redis-backed variant is left disabled.
    tracker = InMemoryTracker()
    # tracker = RedisTracker('redis://localhost')

    # Scraper is given a URL template with one ``{}`` placeholder and a regex
    # whose single capture group matches 11 characters from [a-zA-Z0-9_-].
    scraper = IDScraper(
        'https://www.youtube.com/annotations_invideo?video_id={}',
        r'v=([a-zA-Z0-9_-]{11})',
    )

    # Saver selection; the file- and Redis-based variants are left disabled.
    saver = TarItemSaver(
        tar_path='annotations.tar.gz',
        file_path_fmt='{0}.xml',
    )
    # saver = FileItemSaver(file_path_fmt=lambda fn: os.path.join('annotations', fn[:2], fn) + '.xml')
    # saver = RedisItemSaver('redis://localhost')

    manager = CrawlManager(
        name='test_youtube_annotations',
        num_workers=1,
        tracker=tracker,
        scraper=scraper,
        saver=saver,
    )

    # NOTE(review): asyncio.get_event_loop() without a running loop is
    # deprecated on recent Python versions; asyncio.run(main(manager)) may be
    # a drop-in replacement if none of the components above bind to the
    # default loop at construction time -- confirm before switching.
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(main(manager))
    finally:
        # Always release the loop, even when the crawl raises.
        event_loop.close()