58 changes: 37 additions & 21 deletions api.py
@@ -2,7 +2,7 @@
# Copyright (c) 2020 James Shiffer
# This file contains all the API calls made to archive.org.

import logging, re, requests, sched, time
import logging, re, requests, sched, time, json

class ArchiveReaderClient:

@@ -29,9 +29,9 @@ def borrow_book(self, book_id):
'action': 'browse_book',
'identifier': book_id
})
json = res.json()
if 'success' not in json:
err = json['error'] if 'error' in json else 'unknown error'
json_res = res.json()
if 'success' not in json_res:
err = json_res['error'] if 'error' in json_res else 'unknown error'
logging.error('error with action browse_book: %s' % err)
raise AssertionError

@@ -40,14 +40,14 @@ def borrow_book(self, book_id):
'action': 'grant_access',
'identifier': book_id
})
json = res.json()
if 'success' not in json:
err = json['error'] if 'error' in json else 'unknown error'
json_res = res.json()
if 'success' not in json_res:
err = json_res['error'] if 'error' in json_res else 'unknown error'
logging.error('error with action grant_access: %s' % err)
raise AssertionError
else:
logging.debug('received book token: %s' % json['value'])
self.token = json['value']
logging.debug('received book token: %s' % json_res['value'])
self.token = json_res['value']


# Renews a loaned book, which must be borrowed before calling this method.
@@ -63,14 +63,14 @@ def renew_book(self):
'action': 'create_token',
'identifier': self.book_id
})
json = res.json()
if 'success' not in json:
err = json['error'] if 'error' in json else 'unknown error'
json_res = res.json()
if 'success' not in json_res:
err = json_res['error'] if 'error' in json_res else 'unknown error'
logging.error('error renewing book: %s' % err)
raise AssertionError
else:
logging.debug('renewed book token: %s' % json['token'])
self.token = json['token']
logging.debug('renewed book token: %s' % json_res['token'])
self.token = json_res['token']


# Performs one renewal and schedules the next one for two minutes in the future.
@@ -104,14 +104,18 @@ def fetch_book_metadata(self):
logging.error('regex found no paths for BookReaderJSIA.php!')
raise AssertionError


# fix InvalidURL (No host supplied) error due to updated json response on archive.org
details_url = json.loads(match.group(1))
details_url = details_url['url']
# call the endpoint and voilà, we have all the info we could ever
# want about our book.
res = self.session.get('https:' + match.group(1))
json = res.json()
if 'data' not in json:
res = self.session.get('https:' + details_url)
json_res = res.json()
if 'data' not in json_res:
logging.error('expected data in JSIA response but got none')
raise AssertionError
self.book_meta = json['data']
self.book_meta = json_res['data']
logging.debug('title: %s, imagecount: %s' % (
self.book_meta['metadata']['title'],
self.book_meta['metadata']['imagecount']
@@ -155,10 +159,22 @@ def login(self, email, password):
}, headers={
'referer': self.URL_FORMAT % 'account/login'
})
json = res.json()
if json['status'] != 'ok':
json_res = res.json()
if json_res['status'] != 'ok':
logging.error('login responded with status %s, message %s' % \
(json['status'], json['message']))
(json_res['status'], json_res['message']))
raise AssertionError
else:
logging.debug('user has logged in successfully')

def return_book(self, book_id):
url = self.URL_FORMAT % 'services/loans/loan/'
res = self.session.post(url, {
'action': 'return_loan',
'identifier': book_id
})
json_res = res.json()
if 'success' not in json_res:
err = json_res['error'] if 'error' in json_res else 'unknown error'
logging.error('error with action return_loan: %s' % err)
raise AssertionError
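
A note on the JSIA change above: the regex capture used to be a bare protocol-relative URL, but the updated archive.org response wraps it in a small JSON object, which is why the json import was added and the url field is pulled out before prefixing 'https:'. Below is a minimal sketch of that parsing step; the response shape is assumed from the diff above, and the host and query string are hypothetical.

import json

# Hypothetical capture from the BookReaderJSIA.php regex; the exact shape is an
# assumption based on the parsing code in fetch_book_metadata above.
captured = '{"url": "//ia600000.us.archive.org/BookReader/BookReaderJSIA.php?id=somebook&itemPath=/1/items/somebook"}'

details = json.loads(captured)    # parse the JSON blob instead of treating it as a URL
details_url = details['url']      # protocol-relative URL, hence the 'https:' prefix
full_url = 'https:' + details_url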
47 changes: 31 additions & 16 deletions ripper.py
100644 → 100755
@@ -2,7 +2,7 @@
# Copyright (c) 2020 James Shiffer
# This file contains the main application logic.

import argparse, api, getpass, logging, os, sys
import argparse, api, getpass, logging, os, random, sys, time

def main():
client = api.ArchiveReaderClient()
@@ -18,6 +18,8 @@ def main():
parser.add_argument('-s', '--page-start', type=int, help='Download pages starting at page number N and ending at the book\'s last page, or a range if --page-end has been specified')
parser.add_argument('-e', '--page-end', type=int, help='End of the range of page numbers to download')
parser.add_argument('-d', '--output-dir', help='Directory you want the pages to be written to. If undefined the directory will be named the book id')
parser.add_argument('-nt', '--no-timeout', action='store_true', help='Don\'t wait a few seconds between each image request. The default behaviour is to wait, because if we request one image after the other with no timeout in between, archive.org will typically drop the connection.')
parser.add_argument('-R', '--redownload', action='store_true', help='Redownloads pages even if they\'re already on disk')
parser.add_argument('-S', '--scale', default=0, type=int, help='Image resolution of the pages requested; can save bandwidth if the best image quality isn\'t necessary. Higher integers mean lower resolution, default is 0 (no downscaling)')
args = parser.parse_args()

@@ -48,12 +50,7 @@ def main():
dir = os.path.expanduser(args.output_dir)

logging.debug('creating output dir "%s"' % dir)
if os.path.isdir(dir):
response = input('Output folder %s already exists. Continue? ' \
% dir)
if not response.lower().startswith('y'):
return
else:
if not os.path.isdir(dir):
os.mkdir(dir)

page_count = client.fetch_book_metadata()
@@ -77,16 +74,34 @@ def main():
logging.debug('planning on fetching pages %d thru %d' % (start, end))

total = end - start
completed = 0

for i in range(start, end):
logging.debug('downloading page %d (index %d)' % (i + 1,
i))
contents = client.download_page(i, args.scale)
with open('%s/%d.jpg' % (dir, i + 1), 'wb') as file:
file.write(contents)
done_count = i + 1 - start
print('%d%% (%d/%d) done' % (done_count / total * 100, done_count, total))

print('done')
savepath='%s/%d.jpg' % (dir, i + 1)
savepathnext='%s/%d.jpg' % (dir, i + 2)
logging.debug('downloading page %d (index %d)' % (i + 1, i))

# The logic here may seem complicated, but it just checks whether the file already exists before writing,
# and re-downloads the last saved page even if it exists, because the write could have been interrupted.
if (args.redownload or
(not os.path.isfile(savepath) or
(os.path.isfile(savepath) and not os.path.isfile(savepathnext)))):
contents = client.download_page(i, args.scale)
open(savepath, 'wb').write(contents)

completed += 1
print('Got %s (%d/%d)' % (savepath, completed, total))

# wait a little between requests, otherwise archive.org will block us
if not args.no_timeout:
sleeptime = random.uniform(1, 3)
logging.debug('waiting %.1f sec between requests' % sleeptime)
time.sleep(sleeptime)
else:
print('%s (%d/%d) already on disk, skipping' % (savepath, completed, total))
print('Done.')

client.return_book(id)

if __name__ == '__main__':
main()
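
For reference, the resume check in the download loop above boils down to: download when --redownload is set, when the page file is missing, or when the next page's file is missing (meaning the last write may have been interrupted). A minimal standalone sketch of that decision under the same file-naming scheme; the helper name is hypothetical.

import os

def should_download(savepath, savepathnext, redownload=False):
    # Re-fetch everything when --redownload was passed.
    if redownload:
        return True
    # Fetch pages that are not on disk yet.
    if not os.path.isfile(savepath):
        return True
    # Also re-fetch the most recently saved page: if the following page's file
    # does not exist yet, the last write may have been interrupted mid-way.
    return not os.path.isfile(savepathnext)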