Commit 8569763
1. Don't re-download pages by default 2. Sleep for a few seconds between requests to prevent being blocked

1. adds the -R flag
2. adds the -nt flag; should fix scoliono#1
notevenaperson committed Sep 13, 2021
1 parent 18f4154 commit 8569763
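In outline, the two changes combine into a loop like the following minimal, standalone sketch; rip_pages and fetch_page are hypothetical stand-ins here (the real script uses api.ArchiveReaderClient and the flags added in the diff below):

import os, random, time

def rip_pages(fetch_page, outdir, start, end, redownload=False, no_timeout=False):
    # fetch_page(i) is assumed to return the page image as bytes
    for i in range(start, end):
        savepath = '%s/%d.jpg' % (outdir, i + 1)
        savepathnext = '%s/%d.jpg' % (outdir, i + 2)
        # skip pages already on disk; the newest page is always refetched,
        # since its write may have been cut short
        if not redownload and os.path.isfile(savepath) and os.path.isfile(savepathnext):
            continue
        with open(savepath, 'wb') as f:
            f.write(fetch_page(i))
        if not no_timeout:
            time.sleep(random.uniform(1, 3))  # jittered pause so archive.org doesn't drop us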
Showing 1 changed file with 24 additions and 9 deletions.

ripper.py
@@ -2,7 +2,7 @@
 # Copyright (c) 2020 James Shiffer
 # This file contains the main application logic.

-import argparse, api, getpass, logging, os, sys
+import argparse, api, getpass, logging, os, random, sys, time

 def main():
     client = api.ArchiveReaderClient()
@@ -18,6 +18,8 @@ def main():
     parser.add_argument('-s', '--page-start', type=int, help='Download pages starting at page number N and ending at the book\'s last page, or a range if --page-end has been specified')
     parser.add_argument('-e', '--page-end', type=int, help='End of the range of page numbers to download')
     parser.add_argument('-d', '--output-dir', help='Directory you want the pages to be written to. If undefined, the directory will be named after the book id')
+    parser.add_argument('-nt', '--no-timeout', action='store_true', help='Don\'t wait a few seconds between image requests. The default behaviour is to wait, because if we request one image after another with no timeout in between, archive.org will typically drop the connection.')
+    parser.add_argument('-R', '--redownload', action='store_true', help='Redownload pages even if they\'re already on disk')
     parser.add_argument('-S', '--scale', default=0, type=int, help='Image resolution of the requested pages; can save bandwidth if the best image quality isn\'t necessary. Higher integers mean lower resolution; default is 0 (no downscaling)')
     args = parser.parse_args()
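Both new options are plain store_true flags: they default to False and flip to True when passed. A minimal standalone sketch of the parsing behaviour (this is not the script's full parser):

import argparse

p = argparse.ArgumentParser()
p.add_argument('-nt', '--no-timeout', action='store_true')
p.add_argument('-R', '--redownload', action='store_true')
p.add_argument('-S', '--scale', default=0, type=int)

args = p.parse_args(['-R', '-S', '2'])
print(args.redownload, args.no_timeout, args.scale)  # True False 2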

@@ -77,15 +79,28 @@ def main():
     logging.debug('planning on fetching pages %d thru %d' % (start, end))

     total = end - start
-    for i in range(start, end):
-        logging.debug('downloading page %d (index %d)' % (i + 1,
-            i))
-        contents = client.download_page(i, args.scale)
-        with open('%s/%d.jpg' % (dir, i + 1), 'wb') as file:
-            file.write(contents)
-        done_count = i + 1 - start
-        print('%d%% (%d/%d) done' % (done_count / total * 100, done_count, total))
+    for i in range(start, end):
+        savepath = '%s/%d.jpg' % (dir, i + 1)
+        savepathnext = '%s/%d.jpg' % (dir, i + 2)
+        logging.debug('downloading page %d (index %d)' % (i + 1, i))
+        done_count = i + 1 - start
+
+        # The logic here just checks whether the file already exists before
+        # writing. The most recently saved page is re-downloaded even if it
+        # exists, because the write to file could have been interrupted.
+        if (args.redownload or
+                not os.path.isfile(savepath) or
+                (os.path.isfile(savepath) and not os.path.isfile(savepathnext))):
+            contents = client.download_page(i, str(args.scale))
+            with open(savepath, 'wb') as file:
+                file.write(contents)
+            print('%d%% (%d/%d) done' % (done_count / total * 100, done_count, total))
+
+            # wait a little between requests, otherwise archive.org will block us
+            if not args.no_timeout:
+                sleeptime = random.uniform(1, 3)
+                logging.debug('waiting %.1f sec before the next request' % sleeptime)
+                time.sleep(sleeptime)
+        else:
+            print('%d%% (%d/%d) already on disk, skipping' % (done_count / total * 100, done_count, total))
     print('done')

 if __name__ == '__main__':
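The download condition above reads as redownload or (not A or (A and not B)), where A and B are the two os.path.isfile checks; by De Morgan's law this is simply redownload or not (A and B). A quick standalone check of that equivalence:

import itertools

# exhaustively compare the condition as written with its simplified form
for redownload, a, b in itertools.product([False, True], repeat=3):
    as_written = redownload or (not a or (a and not b))
    simplified = redownload or not (a and b)
    assert as_written == simplified
print('equivalent for all eight input combinations')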
