# utils.py

import dgl
import numpy as np
import networkx as nx
import torch

from sklearn.decomposition import PCA
from tqdm import tqdm
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn styling, applied as a side effect of importing this module.
# NOTE(review): sns.set is an alias of sns.set_theme, whose font_scale defaults
# to 1, so the second call below appears to reset the font_scale=1.5 requested
# by the first one -- confirm which scale is actually intended.
sns.set(font_scale = 1.5)
sns.set_theme()


def build_karate_club_graph():
    '''
    Build the karate club network as a DGL graph.

    All 78 edges are stored in two parallel numpy arrays, one for the source
    endpoint and the other for the target endpoint: edge i joins src[i] and
    dst[i]. The two arrays must therefore have the same length.

    RETURNS:
        the graph built by dgl.graph, holding every edge in both directions
        (156 directed edges over node ids 0..33)

    RAISES:
        ValueError: if the two endpoint arrays get out of sync
    '''
    #node 7 has four neighbours (0, 1, 2, 3), so it appears four times here
    src = np.array([1, 2, 2, 3, 3, 3, 4, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 9, 10, 10,
        10, 11, 12, 12, 13, 13, 13, 13, 16, 16, 17, 17, 19, 19, 21, 21,
        25, 25, 27, 27, 27, 28, 29, 29, 30, 30, 31, 31, 31, 31, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33])
    dst = np.array([0, 0, 1, 0, 1, 2, 0, 0, 0, 4, 5, 0, 1, 2, 3, 0, 2, 2, 0, 4,
        5, 0, 0, 3, 0, 1, 2, 3, 5, 6, 0, 1, 0, 1, 0, 1, 23, 24, 2, 23,
        24, 2, 23, 26, 1, 8, 0, 24, 25, 28, 2, 8, 14, 15, 18, 20, 22, 23,
        29, 30, 31, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30,
        31, 32])

    #A length mismatch would go unnoticed below: both concatenations still
    #have equal length, so the edges would just be silently misaligned.
    if src.shape != dst.shape:
        raise ValueError(
            "edge endpoint arrays differ in length: %d sources vs %d targets"
            % (src.size, dst.size)
        )

    #Edges are directional in DGL; make them bidirectional
    u = np.concatenate([src, dst])
    v = np.concatenate([dst, src])
    #Build the graph
    return dgl.graph((u, v))


"""
logits: [ [[embeddings], [probability tensor]] x 200 epochs]
"""
#iterate through each epoch
def draw(iteration: int, all_logits: torch.tensor, nodelist: [int], nx_G, ax, coloring) -> None:
    """
    Draw the graph for one epoch, placing every node at its model output.

    ARGS:
        iteration: index into all_logits selecting the epoch to draw
        all_logits: per-epoch model outputs; all_logits[iteration][j] is the
            output row of node j
        nodelist: the prelabeled nodes from the training step, drawn on top
            in black with a larger marker
        nx_G: the graph to draw (an undirected copy of it is rendered)
        ax: the axes to draw on; cleared before drawing
        coloring: maps a class index (arg-max of a node's output row) to the
            colour used for that node; when it has more than two entries the
            outputs are reduced to two PCA components to get node positions,
            otherwise the raw output row is used as the position
    """
    node_ids = range(nx_G.number_of_nodes())
    epoch_logits = all_logits[iteration]

    #colour each node by the class with the largest output
    colors = [coloring[epoch_logits[node].numpy().argmax()] for node in node_ids]

    #work out where each node is placed
    if len(coloring) > 2:
        projected = PCA(n_components=2).fit_transform(epoch_logits.numpy())
        embed = {node: projected[node] for node in node_ids}
    else:
        embed = {node: epoch_logits[node].numpy() for node in node_ids}

    #clear from previous graph, then label the new one
    ax.cla()
    ax.set_title('Epoch: %d'% iteration, fontdict = {"fontsize": 8})

    undirected = nx_G.to_undirected()

    #draw the nodes classified from the ML
    nx.draw_networkx(
        undirected,
        embed,
        node_color=colors,
        with_labels=False,
        node_size=20,
        ax=ax,
        width=0.03,
        edgecolors="grey",
    )

    #draw the prelabeled nodes
    nx.draw_networkx_nodes(
        undirected,
        embed,
        nodelist=nodelist,
        node_color="black",
        node_size=50,
        ax=ax,
    )

    #hide the tick marks but keep small tick labels on both axes
    ax.tick_params(left=False, bottom=False, labelleft=True, labelbottom=True)
    for axis_name in ("x", "y"):
        ax.tick_params(axis=axis_name, labelsize=7)


def write_confusion_matrix(iteration: int, all_logits: torch.tensor, nx_G, true_labels: pd.DataFrame, classes: dict, ax, cbar_ax):
    """
    Plot the row-normalised confusion matrix of one epoch as a heatmap.

    ARGS:
        iteration: index into all_logits selecting the epoch to evaluate
        all_logits: per-epoch model outputs; all_logits[iteration][j] is the
            output row of node j
        nx_G: the graph; only its node count is used
        true_labels: data frame whose "team" column holds the true label of
            each node
        classes: {0: label, 1: label, 2: label ...} mapping each class index
            to its label
        ax: the axes the heatmap is drawn on; cleared first
        cbar_ax: the axes the colour bar is drawn on

    The matrix has one row per true label and one column per predicted label:

                        predicted
                        A | B | C
                true  A|
                      B|
                      C|

    Every row is divided by the number of nodes that truly carry that label,
    so a label that never occurs gives a row of NaN.
    """
    from collections import Counter

    true_labels = true_labels["team"].tolist()
    labels = [classes[i] for i in range(len(classes))]

    #get what model thinks of classes
    #NOTE(review): assumes row j of the logits and entry j of true_labels
    #describe the same node -- confirm against the caller
    epoch_logits = all_logits[iteration]
    pred_classes = [
        classes[epoch_logits[j].numpy().argmax()]
        for j in range(nx_G.number_of_nodes())
    ]

    #tally once instead of rescanning every (true, predicted) pair per cell
    true_totals = Counter(true_labels)
    pair_counts = Counter(zip(true_labels, pred_classes))

    #rows: true label, columns: predicted label
    counts = [
        [pair_counts[(true_label, pred_label)] for pred_label in labels]
        for true_label in labels
    ]
    class_len = [true_totals[label] for label in labels]

    #percentage of each true class assigned to each predicted class
    confusion_df = pd.DataFrame(counts, index=labels, columns=labels)
    confusion_df = confusion_df.div(class_len, axis=0)

    ax.cla()

    #plot
    sns.heatmap(data = confusion_df, alpha=0.9, cmap = "magma", yticklabels = confusion_df.index, xticklabels = confusion_df.columns, annot = True, ax = ax, cbar = True, cbar_ax = cbar_ax, vmin = 0, vmax = 1)

    #edit the plot
    ax.set_title('Confusion Matrix for Epoch: %d'% iteration, fontdict = {"fontsize": 15})
    ax.tick_params(axis='x', labelsize=10)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel("Predicted Labels", fontdict={'fontsize':12})
    ax.set_ylabel("True Labels", fontdict={'fontsize':12})

 
def hierarchical_clustering(iteration: int, all_logits: np.array) -> None:
    """
    Placeholder for a hierarchical clustering plot -- nothing is drawn yet.

    ARGS:
        iteration: an int that specifies which iteration you want to cluster
        all_logits: the model outputs, indexed by iteration

    RETURNS:
        None!
        For now the embeddings of the chosen iteration are only looked up,
        so an invalid iteration still fails on the lookup.
    """
    #select the embeddings that the clustering would operate on
    _selected_embeddings = all_logits[iteration]
    return None
    

if __name__ == "__main__":
    #Script entry point: build the karate club graph, report its size and
    #save a picture of it.
    from pathlib import Path

    G = build_karate_club_graph()
    print('We have %d nodes.' % G.number_of_nodes())
    print('We have %d edges.' % G.number_of_edges())

    #Visualize the graph by converting it into a networkx graph
    nx_G = G.to_networkx().to_undirected()
    pos = nx.kamada_kawai_layout(nx_G)
    nx.draw(nx_G, pos, with_labels=True, node_color=[[.7, .7, .7]])

    #savefig does not create missing folders, so make sure the target exists
    out_path = Path('graph_vis/karate.png')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path)