-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpiazza_network_scraper.py
153 lines (129 loc) · 5.77 KB
/
piazza_network_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from piazza_api import Piazza
import networkx as nx
import pprint
from functools import reduce
from operator import add
import time
import logging
import sys
# Root logger configuration: records of level DEBUG and above are appended to
# session_log.txt using a timestamped line format.
logging.basicConfig(
    level=logging.DEBUG,
    filename='session_log.txt',
    filemode='a',
    datefmt='%H:%M:%S',
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
)
# A second handler echoes INFO-and-above records to stdout so progress is
# visible while the script runs; DEBUG detail stays in the log file only.
consoleHandler = logging.StreamHandler(stream=sys.stdout)
consoleHandler.setLevel(logging.INFO)
rootLogger = logging.getLogger()
rootLogger.addHandler(consoleHandler)
# Interactive login: user_login() is called without arguments, so no
# credentials are stored in this script (presumably the library prompts for
# them on the terminal -- see the piazza_api documentation).
print('Please enter your Piazza login credentials.')
p = Piazza()
p.user_login()
# Interactively collect the courses to scrape as {course code: class name}.
# The loop ends when the user submits an empty course code or answers "begin".
add_another_class = True
classes = dict()
pp = pprint.PrettyPrinter(indent=4)
while add_another_class:
    print('Classes added so far:')
    pp.pprint(classes)
    # Codes are usually pasted from a URL; strip stray whitespace so that it
    # neither counts as a non-empty answer nor ends up in the network id.
    class_code = input('Please enter the course code or enter to start parsing. To find the course code, visit the Piazza class and copy the last part of the URL (i.e for https://piazza.com/class/asdfghjkl, paste in \'asdfghjkl\'): ').strip()
    if not class_code:
        add_another_class = False
        continue
    # The class name becomes the output file name prefix later on, so fall
    # back to the course code rather than producing a file with no prefix.
    class_name = input(
        'Please enter the class name for this code: ').strip() or class_code
    classes[class_code] = class_name
    continue_input = input(
        'Enter "begin" to start parsing the Piazza networks OR press enter to add another class: ')
    # Accept "Begin", " begin " etc. instead of silently asking for another class.
    if continue_input.strip().lower() == 'begin':
        add_another_class = False
# Running count of successfully fetched posts across all classes; it drives
# the periodic pause in the main loop below.
total_posts = 0


def _fetch_post(piazza_class, post_id, max_attempts=20):
    """Fetch one post, retrying with a linearly growing delay.

    Sleeps 0s, 1s, 2s, ... before successive attempts. Returns the post, or
    None when all ``max_attempts`` attempts raised.
    """
    for attempt in range(max_attempts):
        time.sleep(attempt)
        try:
            return piazza_class.get_post(post_id)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must still
            # be able to stop the scrape instead of triggering a retry.
            rootLogger.debug('fetch of %s failed', post_id, exc_info=True)
            rootLogger.info('sleep for %d and retry %s', attempt + 1, post_id)
    return None


def _credit_user(uid, size_bonus, nodes, node_sizes, node_interactions):
    """Record one contribution by ``uid`` and return its hashed node key."""
    key = hash(uid)
    nodes.add(uid)
    node_sizes[key] = node_sizes.get(key, 1) + size_bonus
    node_interactions[key] = node_interactions.get(key, 1) + 1
    return key


def _count_edge(edges, source, target):
    """Increment the weight of the directed edge ``source -> target``."""
    targets = edges.setdefault(source, dict())
    targets[target] = targets.get(target, 0) + 1


# TODO refactor this procedural logic into a more readable class
for class_code, class_name in classes.items():
    piazza_class = p.network(class_code)
    output_file = class_name + '_network.gexf'
    feed = piazza_class.get_feed(limit=10000)
    cids = [post['id'] for post in feed["feed"]]
    # edges[source][target] -> number of replies from source to target.
    edges = dict()
    # Raw Piazza uids, kept for the role lookup after the scrape.
    nodes = set()
    # Both dicts are keyed by hash(uid).
    # NOTE: str hashes are salted per interpreter run unless PYTHONHASHSEED is
    # set, so node ids are only stable within a single run.
    node_sizes = dict()
    node_interactions = dict()
    rootLogger.info('Parsing class %s with code %s', class_name, class_code)
    i = 0  # posts successfully fetched for this class
    for post_id in cids:
        # Pause for 15s after every 60 fetched posts (presumably to stay
        # under Piazza's rate limit -- confirm).
        if total_posts > 0 and total_posts % 60 == 0:
            rootLogger.info('Waiting for Piazza response...')
            time.sleep(15)
        post = _fetch_post(piazza_class, post_id)
        if post is None:
            rootLogger.info('skip %s', post_id)
            continue
        i += 1
        total_posts += 1
        rootLogger.info('Processing post %s, num %d: ', post_id, i)
        # The author is taken from the first change_log entry, which is
        # expected to be the 'create' event. This used to be an ``assert``,
        # which disappears under ``python -O`` and otherwise aborts the whole
        # scrape; skip the post with a warning instead.
        change_log = post['change_log']
        if not change_log or change_log[0].get('type') != 'create':
            rootLogger.warning(
                'skip %s: first change_log entry is not a create', post_id)
            continue
        creation = change_log[0]
        if 'uid' not in creation:
            rootLogger.debug('missing uid for %s', post_id)
            continue
        # Weights: +10 size for a post, +2 for a followup, +1 for a comment.
        author = _credit_user(
            creation['uid'], 10, nodes, node_sizes, node_interactions)
        followups = post['children']
        rootLogger.debug('Num followups: %d', len(followups))
        for followup in followups:
            if 'uid' not in followup:
                # anonymous
                continue
            follower = _credit_user(
                followup['uid'], 2, nodes, node_sizes, node_interactions)
            _count_edge(edges, follower, author)
            # TODO add timestamps / dates to edges
            comments = followup['children']
            rootLogger.debug('Num comments: %d', len(comments))
            for comment in comments:
                if 'uid' not in comment:
                    # anonymous
                    continue
                commentor = _credit_user(
                    comment['uid'], 1, nodes, node_sizes, node_interactions)
                _count_edge(edges, commentor, follower)
    node_data = piazza_class.get_users(list(nodes))
    node_roles = {hash(node['id']): node['role'] for node in node_data}
    G = nx.DiGraph()  # Initialize a directed graph object
    for uid in nodes:
        key = hash(uid)
        # NOTE(review): assumes get_users may omit some uids; such nodes are
        # labelled 'unknown' rather than failing with KeyError after the
        # whole scrape has finished.
        G.add_node(key,
                   color=node_roles.get(key, 'unknown'),
                   size=node_sizes[key],
                   interactions=node_interactions[key])
    # Flat comprehension instead of reduce(add, ...): that call raised
    # TypeError when ``edges`` was empty and re-copied the list on every step.
    G.add_weighted_edges_from(
        [(source, target, weight)
         for source, targets in edges.items()
         for target, weight in targets.items()])
    # nx.info() was removed in NetworkX 3.0; log the same summary by hand.
    rootLogger.info('%s with %d nodes and %d edges',
                    type(G).__name__, G.number_of_nodes(), G.number_of_edges())
    nx.write_gexf(G, output_file)
    # plot centralities on another graph where colour is still role, but size
    # is sum of these 3 centralities. Assumption is that 'leaders' are the
    # TAs / instructors
    # betweenness (how often they appear on route between other students)
    rootLogger.info('Betweenness centrality: %s',
                    nx.centrality.betweenness_centrality(G))
    # closeness (how many other students have a short path to that node)
    rootLogger.info('Closeness centrality: %s',
                    nx.centrality.closeness_centrality(G))
    # degree (number of connections they have)
    rootLogger.info('Degree centrality: %s',
                    nx.centrality.degree_centrality(G))