-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathURLDownloader.py
146 lines (113 loc) · 3.44 KB
/
URLDownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import sqlite3
import datetime
from pywebcopy import WebPage, config
import pywebcopy
import yaml
import os.path
# pywebcopy docs found at: https://github.com/rajatomar788/pywebcopy
# Load the PowerBee settings from the YAML file in the working directory.
# The `with` block guarantees the file handle is closed even if parsing fails.
with open(r"PowerBeeConfig.yaml", 'r') as stream:
    try:
        PBconfig = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        # Nothing below can run without a parsed config; re-raise instead of
        # falling through to a confusing NameError on PBconfig.
        raise

# Settings for this script live under the 'downloader_config' key.
downloaderConfig = PBconfig['downloader_config']
# Open the SQLite archive (path is relative to the working directory) and
# create the single cursor used for every query in this script.
connection = sqlite3.connect(r"RedditArchive.db")
cursor = connection.cursor()
# Pick up to 20 random submissions that have a row in Downloads and whose URL
# is set and does not contain 'reddit', 'imgur' or '.youtube.com'.
# NOTE(review): an earlier comment here said rows with DownloadStatus 'DONE'
# are skipped, but the query has no DownloadStatus filter -- confirm intent.
statement = """
SELECT d.SubredditID,
d.SubmissionID,
s.SubmissionTitle,
s.URL
FROM Downloads d
INNER JOIN submissions s ON (d.SubredditID = s.SubredditID) and (d.SubmissionID = s.SubmissionID)
WHERE s.URL is not NULL
AND
s.URL not LIKE '%reddit%'
AND
s.URL not LIKE '%imgur%'
AND
s.URL not LIKE '%.youtube.com%'
ORDER BY RANDOM()
LIMIT 20;
"""
cursor.execute(statement)
# Each element of `results` is a 4-tuple:
# (SubredditID, SubmissionID, SubmissionTitle, URL).
results = cursor.fetchall()
# Folder paths for pywebcopy must be absolute paths.
# RE: https://github.com/rajatomar788/pywebcopy/issues/13
# Incorporated rajatomar788's fix, 15 Jun 2019, thanks!
#
# Reads the download-tracking columns of the single Downloads row identified
# by the two positional parameters (SubredditID, SubmissionID).
fetchDownloadStatusStatment = """
SELECT
LastDownloadAttempted,
LastDownloadCompleted,
DownloadStatus,
DownloadAttemptCount,
LinkControl,
ServerReply,
LocalAbsoluteFilePath
FROM Downloads
WHERE SubredditID = ? AND
SubmissionID = ?
"""
# Writes the same seven tracking columns back to that row. Positional
# parameters: the seven column values in the order listed in SET, followed by
# SubredditID and SubmissionID for the WHERE clause.
updateDownloadsStatement = """
UPDATE Downloads
SET
LastDownloadAttempted = ?,
LastDownloadCompleted = ?,
DownloadStatus = ?,
DownloadAttemptCount = ?,
LinkControl = ?,
ServerReply = ?,
LocalAbsoluteFilePath = ?
WHERE SubredditID = ? AND
SubmissionID = ?
"""
# Root folder for all downloads; each submission is saved beneath it as
# <download_root>/<SubredditID>/<SubmissionID>.
prefix = os.path.normpath(downloaderConfig['download_root'])
count = 0  # NOTE(review): never read anywhere in this script.

# TODO: Locate index.html file from pywebcopy download
# TODO: Add cases for other utilities to download, like youtube-dl, Newspaper3k, twitter-text-python
# TODO: Add logins for specific web sites, like imgur, nytimes, etc.

# For every selected submission: download the page with pywebcopy, then write
# the attempt's timestamps, status and local path back to its Downloads row.
# cursor.fetchall() yields only row tuples, so no end-of-data check is needed.
for result in results:
    print(result)
    (SubredditID, SubmissionID, SubmissionTitle, URL) = result
    save_folder = os.path.join(prefix, SubredditID, SubmissionID)

    startDownloadTime = datetime.datetime.utcnow()
    config.setup_config(URL, save_folder, 'pb')
    wp = WebPage()
    # Remember the failure in a flag: DownloadStatus itself is re-read from
    # the database below, which would discard a value assigned here.
    accessDenied = False
    try:
        wp.get(URL)
        wp.save_complete()
    except pywebcopy.exceptions.AccessError:
        accessDenied = True
    endDownloadTime = datetime.datetime.utcnow()
    print('HTML folder:' + str(wp.file_path))

    # Fetch the current tracking row so the columns this script does not
    # modify (DownloadAttemptCount, LinkControl, ServerReply) are preserved.
    cursor.execute(fetchDownloadStatusStatment, (SubredditID, SubmissionID))
    (LastDownloadAttempted,
     LastDownloadCompleted,
     DownloadStatus,
     DownloadAttemptCount,
     LinkControl,
     ServerReply,
     LocalAbsoluteFilePath
     ) = cursor.fetchone()

    LastDownloadAttempted = str(startDownloadTime)
    LastDownloadCompleted = str(endDownloadTime)
    LocalAbsoluteFilePath = str(wp.file_path)
    # Record 'Bad' when pywebcopy reported an access error; otherwise the
    # attempt is marked 'Attempted'.
    DownloadStatus = 'Bad' if accessDenied else 'Attempted'
    # NOTE(review): DownloadAttemptCount is written back unchanged -- confirm
    # whether it is meant to be incremented on each attempt.

    cursor.execute(updateDownloadsStatement,
                   (LastDownloadAttempted,
                    LastDownloadCompleted,
                    DownloadStatus,
                    DownloadAttemptCount,
                    LinkControl,
                    ServerReply,
                    LocalAbsoluteFilePath,
                    SubredditID,
                    SubmissionID
                    )
                   )
    # Commit per submission so completed work survives a later failure.
    connection.commit()