-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathutils.py
213 lines (155 loc) · 6.75 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import dgl
import numpy as np
import networkx as nx
import torch
from sklearn.decomposition import PCA
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#Global seaborn styling, applied once when this module is imported.
#NOTE(review): in current seaborn, sns.set is an alias of sns.set_theme, so the
#argument-less set_theme() call below presumably resets font_scale to its default
#and the 1.5 scaling never takes effect -- confirm which of the two is intended.
sns.set(font_scale = 1.5)
sns.set_theme()
def build_karate_club_graph():
    """
    Build the karate club graph as a DGL graph.

    The 78 edges are kept as two parallel numpy arrays: position k of
    ``tails`` and position k of ``heads`` are the two endpoints of edge k.

    RETURNS:
    The object returned by dgl.graph for the edge list with every edge
    present in both directions (2 x 78 directed edges).
    """
    tails = np.array([
        1, 2, 2, 3, 3, 3, 4, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 9, 10, 10,
        10, 11, 12, 12, 13, 13, 13, 13, 16, 16, 17, 17, 19, 19, 21, 21,
        25, 25, 27, 27, 27, 28, 29, 29, 30, 30, 31, 31, 31, 31, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
    ])
    heads = np.array([
        0, 0, 1, 0, 1, 2, 0, 0, 0, 4, 5, 0, 1, 2, 3, 0, 2, 2, 0, 4,
        5, 0, 0, 3, 0, 1, 2, 3, 5, 6, 0, 1, 0, 1, 0, 1, 23, 24, 2, 23,
        24, 2, 23, 26, 1, 8, 0, 24, 25, 28, 2, 8, 14, 15, 18, 20, 22, 23,
        29, 30, 31, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30,
        31, 32,
    ])

    #DGL edges are directional, so list each edge once per direction
    forward_then_reverse = np.concatenate((tails, heads))
    reverse_then_forward = np.concatenate((heads, tails))

    #Build the graph from the (source, destination) endpoint arrays
    return dgl.graph((forward_then_reverse, reverse_then_forward))
"""
logits: [ [[embeddings], [probability tensor]] x 200 epochs]
"""
#iterate through each epoch
def draw(iteration: int, all_logits: torch.tensor, nodelist: [int], nx_G, ax, coloring) -> None:
    """
    Render the graph for one epoch onto ``ax``, placing nodes by their logits.

    ARGS:
    iteration: index of the epoch to draw; also shown in the plot title
    all_logits: per-epoch model outputs; all_logits[iteration][j] is read as
        the output vector of node j
    nodelist: nodes drawn again on top in black (the prelabeled nodes from
        the training step)
    nx_G: the graph; must provide number_of_nodes() and to_undirected()
    ax: matplotlib axes to draw on; it is cleared before drawing
    coloring: maps a class index (argmax of a node's vector) to a color

    RETURNS:
    None; the drawing on ``ax`` is the only result.
    """
    epoch_logits = all_logits[iteration]
    node_ids = range(nx_G.number_of_nodes())

    #each node gets the color of its highest-scoring class
    colors = [coloring[epoch_logits[j].numpy().argmax()] for j in node_ids]

    if len(coloring) > 2:
        #more than two colors: reduce the logits to 2 PCA components for plotting
        projected = PCA(n_components=2).fit_transform(epoch_logits.numpy())
        embed = {j: projected[j] for j in node_ids}
    else:
        #otherwise the raw logits are used directly as node positions
        #(presumably 2-dimensional in this case -- verify against the model)
        embed = {j: epoch_logits[j].numpy() for j in node_ids}

    #start from a clean axes, then title the plot
    ax.cla()
    ax.set_title('Epoch: %d'% iteration, fontdict = {"fontsize": 8})

    undirected = nx_G.to_undirected()
    #all nodes, colored by the class the model predicts
    nx.draw_networkx(undirected, embed, node_color=colors, with_labels=False,
                     node_size=20, ax=ax, width=0.03, edgecolors="grey")
    #prelabeled nodes on top, larger and in black
    nx.draw_networkx_nodes(undirected, embed, nodelist=nodelist,
                           node_color="black", node_size=50, ax=ax)

    ax.tick_params(left=False, bottom=False, labelleft=True, labelbottom=True)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=7)
def write_confusion_matrix(iteration: int, all_logits: torch.tensor, nx_G, true_labels: pd.DataFrame, classes: dict, ax, cbar_ax):
    """
    Draw a row-normalized confusion matrix for one epoch as a heatmap on ``ax``.

    Rows are the true labels and columns are the predicted labels; every cell
    is the count for that (true, predicted) pair divided by the number of
    nodes carrying that true label.

    ARGS:
    iteration: index of the epoch to evaluate; also shown in the plot title
    all_logits: per-epoch model outputs; all_logits[iteration][j] is read as
        the output vector of node j
    nx_G: the graph; only number_of_nodes() is used
    true_labels: DataFrame whose "team" column holds the true label per node
        NOTE(review): assumes its row order matches the node order of
        all_logits -- the original author flagged this as unverified
    classes: maps class index 0..len(classes)-1 to its label
    ax: matplotlib axes for the heatmap; it is cleared before drawing
    cbar_ax: axes handed to seaborn for the color bar
    """
    actual = true_labels["team"].tolist()
    labels = [classes[k] for k in range(len(classes))]

    #how many nodes truly belong to each class; used as the row denominators
    row_totals = [actual.count(label) for label in labels]

    #label the model assigns to each node: the class with the highest score
    epoch_logits = all_logits[iteration]
    predicted = [
        classes[epoch_logits[node].numpy().argmax()]
        for node in range(nx_G.number_of_nodes())
    ]

    #one column per predicted label; inside a column, one count per true label:
    #   {A: [true x, true y, true z], B: [...], C: [...]}
    table = {
        pred_label: [
            sum(1 for t, p in zip(actual, predicted)
                if p == pred_label and t == true_label)
            for true_label in labels
        ]
        for pred_label in labels
    }
    table["class_len"] = row_totals

    frame = pd.DataFrame(table)
    #turn counts into fractions of each true class, dropping the helper column
    count_columns = frame.columns[:-1]
    fractions = frame[count_columns].div(frame.class_len, axis=0)
    fractions.index = fractions.columns.to_list()

    ax.cla()
    sns.heatmap(
        data = fractions,
        alpha=0.9,
        cmap = "magma",
        yticklabels = fractions.index,
        xticklabels = fractions.columns,
        annot = True,
        ax = ax,
        cbar = True,
        cbar_ax = cbar_ax,
        vmin = 0,
        vmax = 1,
    )

    #titles, tick sizes and axis labels
    ax.set_title('Confusion Matrix for Epoch: %d'% iteration, fontdict = {"fontsize": 15})
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=10)
    ax.set_xlabel("Predicted Labels", fontdict={'fontsize':12})
    ax.set_ylabel("True Labels", fontdict={'fontsize':12})
def hierarchical_clustering(iteration: int, all_logits: np.array) -> None:
    """
    Placeholder for a hierarchical-clustering plot of one iteration's embeddings.

    ARGS:
    iteration: index into all_logits selecting the iteration to cluster
    all_logits: indexable collection holding the model's embeddings per iteration
    RETURNS:
    None. No clustering or drawing is implemented yet; the selected
    embeddings are looked up and then discarded.
    """
    #TODO: cluster and plot the embeddings selected here
    _selected_embeddings = all_logits[iteration]
    return None
if __name__ == "__main__":
    #Script entry point: build the karate club graph, report its size and
    #save a picture of it to graph_vis/karate.png.

    #local import: only this entry point needs it
    import os

    G = build_karate_club_graph()
    print('We have %d nodes.' % G.number_of_nodes())
    print('We have %d edges.' % G.number_of_edges())

    #Visualize the graph by converting it into a networkx graph
    nx_G = G.to_networkx().to_undirected()
    pos = nx.kamada_kawai_layout(nx_G)
    nx.draw(nx_G, pos, with_labels=True, node_color=[[.7, .7, .7]])

    #savefig does not create missing directories, so make sure the target
    #folder exists instead of failing on a fresh checkout
    output_path = 'graph_vis/karate.png'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path)