diff --git a/api.py b/api.py
index fb0c048..90e8f1b 100644
--- a/api.py
+++ b/api.py
@@ -2,7 +2,7 @@
 # Copyright (c) 2020 James Shiffer
 # This file contains all the API calls made to archive.org.
 
-import logging, re, requests, sched, time
+import logging, re, requests, sched, time, json
 
 class ArchiveReaderClient:
 
@@ -29,9 +29,9 @@ def borrow_book(self, book_id):
             'action': 'browse_book',
             'identifier': book_id
         })
-        json = res.json()
-        if 'success' not in json:
-            err = json['error'] if 'error' in json else 'unknown error'
+        json_res = res.json()
+        if 'success' not in json_res:
+            err = json_res['error'] if 'error' in json_res else 'unknown error'
             logging.error('error with action browse_book: %s' % err)
             raise AssertionError
 
@@ -40,14 +40,14 @@ def borrow_book(self, book_id):
             'action': 'grant_access',
             'identifier': book_id
         })
-        json = res.json()
-        if 'success' not in json:
-            err = json['error'] if 'error' in json else 'unknown error'
+        json_res = res.json()
+        if 'success' not in json_res:
+            err = json_res['error'] if 'error' in json_res else 'unknown error'
             logging.error('error with action grant_access: %s' % err)
             raise AssertionError
         else:
-            logging.debug('received book token: %s' % json['value'])
-            self.token = json['value']
+            logging.debug('received book token: %s' % json_res['value'])
+            self.token = json_res['value']
 
     # Renews a loaned book, which must be borrowed before calling this method.
@@ -63,14 +63,14 @@ def renew_book(self):
             'action': 'create_token',
             'identifier': self.book_id
         })
-        json = res.json()
-        if 'success' not in json:
-            err = json['error'] if 'error' in json else 'unknown error'
+        json_res = res.json()
+        if 'success' not in json_res:
+            err = json_res['error'] if 'error' in json_res else 'unknown error'
             logging.error('error renewing book: %s' % err)
             raise AssertionError
         else:
-            logging.debug('renewed book token: %s' % json['token'])
-            self.token = json['token']
+            logging.debug('renewed book token: %s' % json_res['token'])
+            self.token = json_res['token']
 
     # Performs one renewal and schedules the next one for two minutes in the future.
@@ -104,14 +104,18 @@ def fetch_book_metadata(self):
             logging.error('regex found no paths for BookReaderJSIA.php!')
             raise AssertionError
 
+
+        # fix InvalidURL (No host supplied) error due to updated json response on archive.org
+        details_url = json.loads(match.group(1))
+        details_url = details_url['url']
         # call the endpoint and viola, we have all the info we could ever
         # want about our book.
-        res = self.session.get('https:' + match.group(1))
-        json = res.json()
-        if 'data' not in json:
+        res = self.session.get('https:' + details_url)
+        json_res = res.json()
+        if 'data' not in json_res:
             logging.error('expected data in JSIA response but got none')
             raise AssertionError
-        self.book_meta = json['data']
+        self.book_meta = json_res['data']
         logging.debug('title: %s, imagecount: %s' % (
             self.book_meta['metadata']['title'],
             self.book_meta['metadata']['imagecount']
@@ -155,10 +159,22 @@ def login(self, email, password):
         }, headers={
             'referer': self.URL_FORMAT % 'account/login'
         })
-        json = res.json()
-        if json['status'] != 'ok':
+        json_res = res.json()
+        if json_res['status'] != 'ok':
             logging.error('login responded with status %s, message %s' % \
-                (json['status'], json['message']))
+                (json_res['status'], json_res['message']))
             raise AssertionError
         else:
             logging.debug('user has logged in successfully')
+
+    def return_book(self, book_id):
+        url = self.URL_FORMAT % 'services/loans/loan/'
+        res = self.session.post(url, {
+            'action': 'return_loan',
+            'identifier': book_id
+        })
+        json_res = res.json()
+        if 'success' not in json_res:
+            err = json_res['error'] if 'error' in json_res else 'unknown error'
+            logging.error('error with action return_loan: %s' % err)
+            raise AssertionError
diff --git a/ripper.py b/ripper.py
old mode 100644
new mode 100755
index ca324f5..8ce89e6
--- a/ripper.py
+++ b/ripper.py
@@ -2,7 +2,7 @@
 # Copyright (c) 2020 James Shiffer
 # This file contains the main application logic.
 
-import argparse, api, getpass, logging, os, sys
+import argparse, api, getpass, logging, os, random, sys, time
 
 def main():
     client = api.ArchiveReaderClient()
@@ -18,6 +18,8 @@ def main():
     parser.add_argument('-s', '--page-start', type=int, help='Download pages starting at page number N and ending at the book\'s last page, or a range if --page-end has been specified')
     parser.add_argument('-e', '--page-end', type=int, help='End of the range of page numbers to download')
     parser.add_argument('-d', '--output-dir', help='Directory you want the pages to be written to. If undefined the directory will be named the book id')
+    parser.add_argument('-nt', '--no-timeout', action='store_true', help='Don\'t wait a few seconds between each image request. The default behaviour is to wait, because if we request one image after the other with no timeout in between, archive.org will typically drop the connection.')
+    parser.add_argument('-R', '--redownload', action='store_true', help='Redownload pages even if they\'re already on disk')
     parser.add_argument('-S', '--scale', default=0, type=int, help='Image resolution of the pages requested, can save bandwidth if the best image quality isn\'t necessary. Higher integers mean smaller resolution, default is 0 (no downscaling)')
     args = parser.parse_args()
 
@@ -48,12 +50,7 @@ def main():
         dir = os.path.expanduser(args.output_dir)
 
     logging.debug('creating output dir "%s"' % dir)
-    if os.path.isdir(dir):
-        response = input('Output folder %s already exists. Continue? ' \
-            % dir)
-        if not response.lower().startswith('y'):
-            return
-    else:
+    if not os.path.isdir(dir):
         os.mkdir(dir)
 
     page_count = client.fetch_book_metadata()
@@ -77,16 +74,34 @@ def main():
     logging.debug('planning on fetching pages %d thru %d' % (start, end))
     total = end - start
+    completed = 0
+
     for i in range(start, end):
-        logging.debug('downloading page %d (index %d)' % (i + 1,
-            i))
-        contents = client.download_page(i, args.scale)
-        with open('%s/%d.jpg' % (dir, i + 1), 'wb') as file:
-            file.write(contents)
-        done_count = i + 1 - start
-        print('%d%% (%d/%d) done' % (done_count / total * 100, done_count, total))
-
-    print('done')
+        savepath = '%s/%d.jpg' % (dir, i + 1)
+        savepathnext = '%s/%d.jpg' % (dir, i + 2)
+        logging.debug('downloading page %d (index %d)' % (i + 1, i))
+
+        # the logic here may seem complicated, but it just skips pages that are already on disk and
+        # re-downloads the last saved page even if it exists, since the previous write could have been interrupted
+        if (args.redownload or
+            (not os.path.isfile(savepath) or
+            (os.path.isfile(savepath) and not os.path.isfile(savepathnext)))):
+            contents = client.download_page(i, args.scale)
+            open(savepath, 'wb').write(contents)
+
+            completed += 1
+            print('Got %s (%d/%d)' % (savepath, completed, total))
+
+            # wait a little between requests, otherwise archive.org will block us
+            if not args.no_timeout:
+                sleeptime = random.uniform(1, 3)
+                logging.debug('waiting %.1f sec between requests' % sleeptime)
+                time.sleep(sleeptime)
+        else:
+            print('%s (%d/%d) already on disk, skipping' % (savepath, completed, total))
+    print('Done.')
+
+    client.return_book(id)
 
 
 if __name__ == '__main__':
     main()
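
For reference, a minimal sketch (not part of the patch) of how the client is driven once return_book() exists, roughly mirroring the flow ripper.py now follows: log in, borrow, fetch metadata, download pages with a short pause between requests, then return the loan. The credentials and book identifier below are placeholders.

import random, time
import api

client = api.ArchiveReaderClient()
client.login('user@example.com', 'password')   # placeholder credentials
client.borrow_book('examplebook00')            # placeholder identifier
page_count = client.fetch_book_metadata()

for i in range(page_count):
    contents = client.download_page(i, 0)      # scale 0 means no downscaling
    with open('%d.jpg' % (i + 1), 'wb') as f:
        f.write(contents)
    time.sleep(random.uniform(1, 3))           # pause so archive.org doesn't drop the connection

client.return_book('examplebook00')            # release the loan when finished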