Skip to content
This repository has been archived by the owner on Jul 1, 2022. It is now read-only.

Commit

Permalink
Keep track of downloaded pages, fix #15
Browse files Browse the repository at this point in the history
Pages that have already been visited will no longer be analyzed again (use the
overwrite option to force a full re-download)
  • Loading branch information
voyageur committed Jul 29, 2018
1 parent 13112ab commit aaf6249
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions dagr/dagr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

# This file is offered as-is, without any warranty.

import json
import re
import sys
from getopt import gnu_getopt, GetoptError
Expand Down Expand Up @@ -247,6 +248,18 @@ def get_images(self, mode, mode_arg, pages):
print(str(mkdir_error))
return

# Find previously downloaded pages
existing_pages = []
try:
with open(base_dir + "/.dagr_downloaded_pages", "r") as filehandle:
existing_pages = json.load(filehandle)
except FileNotFoundError as fnf_error:
# May not exist (new directory, ...)
pass
if not self.overwrite:
pages = [x for x in pages if x not in existing_pages]

print("Total deviations to download: " + str(len(pages)))
for count, link in enumerate(pages, start=1):
if self.verbose:
print("Downloading " + str(count) + " of " +
Expand All @@ -267,9 +280,16 @@ def get_images(self, mode, mode_arg, pages):
except DagrException as get_error:
self.handle_download_error(link, get_error)
continue
else:
if link not in existing_pages:
existing_pages.append(link)
else:
print(filelink)

# Update downloaded pages cache
with open(base_dir + "/.dagr_downloaded_pages", "w") as filehandle:
json.dump(existing_pages, filehandle)

def deviant_get(self, mode, mode_arg=None):
print("Ripping " + self.deviant + "'s " + mode + "...")

Expand Down

0 comments on commit aaf6249

Please sign in to comment.