-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathserverscrape.py
216 lines (200 loc) · 7.29 KB
/
serverscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/usr/bin/env python3
#
# CHANGE THESE VARIABLES TO CONFIGURE
#
# Connection settings handed to InfluxDBClient (host, port, user, password).
influxHost = 'localhost'
influxPort = 8086
influxUser = 'username'
influxPassword = 'password'
# Host entry handed to the Elasticsearch client.
elasticHost = 'localhost'
# Token passed to client.run() to log in to Discord.
discordToken = 'your token here'
databasePrefix = 'serverscrape' # InfluxDB databases and Elasticsearch indices are named databasePrefix_guildID
# Done with config variables
import discord
from elasticsearch import Elasticsearch, helpers
import sys
from influxdb import InfluxDBClient
import datetime
import json
# Shared service clients used by the functions below.
influx = InfluxDBClient(influxHost, influxPort, influxUser, influxPassword)
elastic = Elasticsearch([
    {'host':elasticHost}
])
client = discord.Client()
# The guild to scrape is given as the first command-line argument. Exit with a
# usage message instead of a raw traceback when it is missing or not an integer.
try:
    guildToScrape = int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit('Usage: {0} <guildID>  (guildID must be an integer)'.format(sys.argv[0]))
def buildElasticDocFromMessage(message):
    """Serialize a message as one Elasticsearch bulk ``create`` entry.

    Args:
        message: Object exposing ``id``, ``content``, ``clean_content``,
            ``created_at`` (a datetime), ``attachments`` (items with ``url``),
            and ``author``/``guild``/``channel`` objects with ``id`` and
            ``name``.

    Returns:
        A two-line string: the JSON ``create`` action (index
        ``databasePrefix_guildID``, ``_id`` set to the message id), a newline,
        then the JSON document body.
    """
    indexName = '{0}_{1}'.format(databasePrefix, message.guild.id)
    attachmentUrls = [attachment.url for attachment in message.attachments]

    createdAt = message.created_at
    if createdAt.tzinfo is not None:
        # Normalise aware datetimes to naive UTC so the epoch subtraction
        # below works for both naive and aware inputs.
        createdAt = createdAt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    # Exact integer milliseconds since the epoch. The float form
    # int(total_seconds() * 1000) can land one millisecond low due to rounding.
    timestamp = (createdAt - datetime.datetime(1970, 1, 1)) // datetime.timedelta(milliseconds=1)

    createObj = {
        "create": {
            "_index": indexName,
            "_id": message.id
        }
    }
    cleanText = message.clean_content
    docObject = {
        "type": 'default',
        "timestamp": timestamp,
        "messageSent": 1,
        "authorID": message.author.id,
        "authorName": message.author.name,
        "messageID": message.id,
        "messageLength": len(cleanText),
        "messageTextRaw": message.content,
        "messageText": cleanText,
        "channelID": message.channel.id,
        "channelName": message.channel.name,
        "serverID": message.guild.id,
        "serverName": message.guild.name,
        "attachments": ', '.join(attachmentUrls)
    }
    return '{0}\n{1}'.format(json.dumps(createObj), json.dumps(docObject))
def buildInfluxPointFromMessage(message):
    """Build an InfluxDB JSON-protocol point describing one message.

    Args:
        message: Object exposing ``id``, ``content``, ``clean_content``,
            ``created_at`` (a datetime), ``attachments`` (items with ``url``),
            and ``author``/``guild``/``channel`` objects with ``id`` and
            ``name``.

    Returns:
        A dict with ``measurement``, ``tags``, ``time`` and ``fields`` keys.
        ``time`` is an integer number of milliseconds since 1970-01-01.
    """
    attachmentUrls = [attachment.url for attachment in message.attachments]

    createdAt = message.created_at
    if createdAt.tzinfo is not None:
        # Normalise aware datetimes to naive UTC so the epoch subtraction
        # below works for both naive and aware inputs.
        createdAt = createdAt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    # Exact integer milliseconds since the epoch. The float form
    # int(total_seconds() * 1000) can land one millisecond low due to rounding.
    timestamp = (createdAt - datetime.datetime(1970, 1, 1)) // datetime.timedelta(milliseconds=1)

    authorID = message.author.id
    authorName = message.author.name
    serverID = message.guild.id
    serverName = message.guild.name
    channelID = message.channel.id
    channelName = message.channel.name
    cleanText = message.clean_content

    point = {
        "measurement": 'chatMessage',
        "tags": {
            "type": "default",
            "authorID": authorID,
            "authorName": authorName,
            "serverID": serverID,
            "serverName": serverName,
            "channelID": channelID,
            "channelName": channelName,
        },
        "time": timestamp,
        "fields": {
            "messageSent": 1,
            "authorID": authorID,
            "authorName": authorName,
            "messageID": message.id,
            "messageLength": len(cleanText),
            "messageTextRaw": message.content,
            "messageText": cleanText,
            "channelID": channelID,
            "channelName": channelName,
            "serverID": serverID,
            "serverName": serverName,
            "attachments": ', '.join(attachmentUrls)
        }
    }
    return point
def checkListOfDict(list, key, value):
    """Report whether any dict in ``list`` maps ``key`` to ``value``.

    Entries are checked in order and the scan stops at the first match.
    A dict that lacks ``key`` raises KeyError if it is reached.
    """
    return any(True for entry in list if entry[key] == value)
async def influxDBInit(guildID):
    """Make sure the guild's InfluxDB database exists and select it.

    The database is named ``databasePrefix_guildID``. It is created when it
    does not appear in the server's database list, and in either case it
    becomes the client's current database.

    Args:
        guildID: ID of the guild, used as the database name suffix.
    """
    dbName = '{0}_{1}'.format(databasePrefix, guildID)
    # Fetch the database list once and reuse it for both the printout and the
    # existence check, rather than querying the server twice.
    existingDatabases = influx.get_list_database()
    print(existingDatabases)
    if not checkListOfDict(existingDatabases, 'name', dbName):
        influx.create_database(dbName)
        print('Created influxDB database {0}'.format(dbName))
    influx.switch_database(dbName)
async def elasticInit(guildID):
    """Create the Elasticsearch index named ``databasePrefix_guildID``.

    The create call is issued with ``ignore=400``.
    """
    elastic.indices.create(
        index='{prefix}_{guild}'.format(prefix=databasePrefix, guild=guildID),
        ignore=400,
    )
async def scrapeChannel(channel):
    """Scrape a channel's message history into Elasticsearch and InfluxDB.

    History is fetched in pages of up to 1000 messages, each page anchored
    before the last message of the previous page. Every non-empty page is
    written to both stores as one bulk request. A page shorter than 1000
    messages is treated as the end of the channel.

    Args:
        channel: Channel to scrape; must expose ``name``, ``guild.id`` and
            ``history()``.
    """
    batchSize = 1000
    dbName = '{0}_{1}'.format(databasePrefix, channel.guild.id)
    print('Scraping channel {0}'.format(channel.name))
    # Start with no anchor rather than channel.last_message: `before` excludes
    # the anchor itself, so anchoring on the newest message would skip it.
    lastMessage = None
    lastGroup = False
    messageCount = 0
    while not lastGroup:
        messageBatch = await channel.history(limit=batchSize, before=lastMessage).flatten()
        print("{0} Messages scraped.".format(messageCount))
        # Move the cursor back up so the next progress line overwrites this one.
        sys.stdout.write("\033[F")
        if len(messageBatch) < batchSize:
            lastGroup = True
        if not messageBatch:
            # Nothing to write (empty channel, or history ended exactly on a
            # page boundary).
            break
        # Build fresh buffers for every page. Accumulating across pages would
        # re-send every earlier message with each flush.
        influxBatchBuffer = [buildInfluxPointFromMessage(message) for message in messageBatch]
        # The bulk body is newline-delimited JSON terminated by a newline.
        elasticBatchBuffer = '\n'.join(
            buildElasticDocFromMessage(message) for message in messageBatch
        ) + '\n'
        messageCount = messageCount + len(messageBatch)
        # The final message of this page is the anchor for the next fetch.
        lastMessage = messageBatch[-1]
        elastic.bulk(body=elasticBatchBuffer)
        # Point times are integer milliseconds, so state the precision
        # explicitly; otherwise integer times are taken as nanoseconds.
        influx.write_points(
            influxBatchBuffer,
            time_precision='ms',
            database=dbName,
            protocol=u'json',
        )
    sys.stdout.write("\n")
    print('{0} Total messages scraped.'.format(messageCount))
def getReadableChannels(guild):
    """Return the guild's text channels whose message history the bot can read.

    Args:
        guild: Guild whose ``channels`` are inspected.

    Returns:
        A list of channels whose type string is ``'text'`` and for which the
        bot's own member has the ``read_message_history`` permission.
    """
    member = guild.get_member(client.user.id)
    return [
        channel
        for channel in guild.channels
        # Use equality here: `in 'text'` would be a substring test and would
        # also accept any type whose name is a fragment of "text".
        if str(channel.type) == 'text'
        and channel.permissions_for(member).read_message_history
    ]
@client.event
async def on_ready():
    """Scrape every readable text channel of the guild named on the command line.

    Looks up the guild with ID ``guildToScrape``. If the client is not in that
    guild, a message is printed and nothing else happens. Otherwise the
    InfluxDB database and Elasticsearch index for the guild are prepared and
    each readable text channel is scraped in turn.
    """
    print('Connected as {0.user}'.format(client))
    # Are we a member of the guild provided from the command line?
    workingGuild = client.get_guild(guildToScrape)
    if workingGuild is None:
        print('{0.user} does not appear to be in the provided guild'.format(client) )
        return
    print('Found the provided guild.')
    # Prepare both storage backends before any message is written.
    await influxDBInit(workingGuild.id)
    await elasticInit(workingGuild.id)
    # Get only text channels, discarding topics, voice, whatever
    for channel in getReadableChannels(workingGuild):
        # Scrape and store the messages of each channel, one channel at a time.
        await scrapeChannel(channel)
# Start the Discord client with the configured token; the on_ready handler
# registered above performs the scrape.
client.run(discordToken)