diff --git a/model.py b/model.py index 1a0a046..3835a1f 100644 --- a/model.py +++ b/model.py @@ -19,671 +19,741 @@ from collections import Counter class GCNModelAE(nn.Module): - def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): - super(GCNModelAE, self).__init__() + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelAE, self).__init__() - self.args = args - self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) - self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) - self.dc = InnerProductDecoder(dropout, act=lambda x: x) - # self.dc = InnerDecoder(dropout, act=lambda x: x) + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) - def forward(self, x, adj): - z = self.gc1(x,adj) - z = self.gc2(z,adj) - return self.dc(z),z,None + def forward(self, x, adj): + z = self.gc1(x,adj) + z = self.gc2(z,adj) + return self.dc(z),z,None - def loss(self,pred_adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + def loss(self,pred_adj,labels, n_nodes, n_features, norm, pos_weight,L=1): - cost = norm * F.binary_cross_entropy_with_logits(pred_adj, labels,pos_weight = pos_weight) - return cost, + cost = norm * F.binary_cross_entropy_with_logits(pred_adj, labels,pos_weight = pos_weight) + return cost, - def check_parameters(self): - for name, param in self.named_parameters(): - if param.requires_grad: - print(name, param.data,param.data.shape) + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) class GCNModelVAE(nn.Module): - def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): - super(GCNModelVAE, self).__init__() + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAE, self).__init__() - self.args = args - self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) - self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) - self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) - self.dc = InnerProductDecoder(dropout, act=lambda x: x) - # self.dc = InnerDecoder(dropout, act=lambda x: x) + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) - def encoder(self, x, adj): - hidden1 = self.gc1(x, adj) - return self.gc2(hidden1, adj), self.gc3(hidden1, adj) + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + return self.gc2(hidden1, adj), self.gc3(hidden1, adj) - def decoder(self,mu,logvar): + def decoder(self,mu,logvar): - z_u = self.reparameterize(mu, logvar) + z_u = self.reparameterize(mu, logvar) - return self.dc(z_u) + return self.dc(z_u) - def reparameterize(self, mu, logvar): - std = torch.exp(logvar) - eps = torch.randn_like(std) - return eps.mul(std).add_(mu) + def reparameterize(self, mu, logvar): + std = torch.exp(logvar) + eps = 
torch.randn_like(std) + return eps.mul(std).add_(mu) - # if self.training: - # std = torch.exp(logvar) - # eps = torch.randn_like(std) - # return eps.mul(std).add_(mu) - # else: - # return mu + # if self.training: + # std = torch.exp(logvar) + # eps = torch.randn_like(std) + # return eps.mul(std).add_(mu) + # else: + # return mu - def forward(self, x, adj): + def forward(self, x, adj): - mu, logvar = self.encoder(x, adj) - z_u = self.reparameterize(mu, logvar) - # z_a = self.reparameterize(mu_a,logvar_a) - return self.dc(z_u),mu, logvar + mu, logvar = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) + return self.dc(z_u),mu, logvar - def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): - det=1e-10 - norm_u = norm - pos_weight_u= pos_weight + det=1e-10 + norm_u = norm + pos_weight_u= pos_weight - L_rec_u=0 + L_rec_u=0 - mu, logvar = self.encoder(x, adj) - # z_mu, z_sigma2_log = self.encoder(x) - for l in range(L): + mu, logvar = self.encoder(x, adj) + # z_mu, z_sigma2_log = self.encoder(x) + for l in range(L): - pred_adj = self.decoder(mu,logvar) + pred_adj = self.decoder(mu,logvar) - cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels ,pos_weight = pos_weight) + cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels ,pos_weight = pos_weight) - L_rec_u += cost_u + L_rec_u += cost_u - L_rec_u/=L + L_rec_u/=L - KLD = -0.5 / n_nodes * torch.mean(torch.sum(1 + 2 * logvar - mu.pow(2) - logvar.exp().pow(2),1)) - return L_rec_u, KLD + KLD = -0.5 / n_nodes * torch.mean(torch.sum(1 + 2 * logvar - mu.pow(2) - logvar.exp().pow(2),1)) + return L_rec_u, KLD - def check_parameters(self): - for name, param in self.named_parameters(): - if param.requires_grad: - print(name, param.data,param.data.shape) + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) class GCNModelVAECD(nn.Module): - def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): - super(GCNModelVAECD, self).__init__() + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAECD, self).__init__() - self.args = args - self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) - self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) - self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) - self.dc = InnerProductDecoder(dropout, act=lambda x: x) - # self.dc = InnerDecoder(dropout, act=lambda x: x) + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) - #for embedding attributes/features - # self.linear_a1= Linear(n_nodes, hidden_dim1, act = torch.tanh,sparse_inputs=True) # the input dim is the number of nodes - # self.linear_a2= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) - # self.linear_a3= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + #for embedding attributes/features + # self.linear_a1= Linear(n_nodes, hidden_dim1, act = torch.tanh,sparse_inputs=True) # the input dim is the number of 
nodes + # self.linear_a2= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + # self.linear_a3= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) - self.pi_=nn.Parameter(torch.FloatTensor(args.nClusters,).fill_(1)/args.nClusters,requires_grad=True) - self.mu_c=nn.Parameter(torch.randn(args.nClusters,hidden_dim2),requires_grad=True) - self.log_sigma2_c=nn.Parameter(torch.randn(args.nClusters,hidden_dim2),requires_grad=True) + self.pi_=nn.Parameter(torch.FloatTensor(args.nClusters,).fill_(1)/args.nClusters,requires_grad=True) + self.mu_c=nn.Parameter(torch.randn(args.nClusters,hidden_dim2),requires_grad=True) + self.log_sigma2_c=nn.Parameter(torch.randn(args.nClusters,hidden_dim2),requires_grad=True) - def encoder(self, x, adj): - hidden1 = self.gc1(x, adj) - # hidden_a1 = self.linear_a1(x.t()) # transpose the input feature matrix + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + # hidden_a1 = self.linear_a1(x.t()) # transpose the input feature matrix - return self.gc2(hidden1, adj), self.gc3(hidden1, adj) + return self.gc2(hidden1, adj), self.gc3(hidden1, adj) - def decoder(self,mu,logvar): + def decoder(self,mu,logvar): - z_u = self.reparameterize(mu, logvar) - # z_a = self.reparameterize(mu_a,logvar_a) + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) - return self.dc(z_u) + return self.dc(z_u) - def reparameterize(self, mu, logvar): - if self.training: - std = torch.exp(logvar) - eps = torch.randn_like(std) - return eps.mul(std).add_(mu) - else: - return mu + def reparameterize(self, mu, logvar): + if self.training: + std = torch.exp(logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + else: + return mu - def forward(self, x, adj): + def forward(self, x, adj): - mu, logvar = self.encoder(x, adj) - z_u = self.reparameterize(mu, logvar) - # z_a = self.reparameterize(mu_a,logvar_a) - return self.dc(z_u),mu, logvar + mu, logvar = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) + return self.dc(z_u),mu, logvar - def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): - det=1e-10 - norm_u = norm - pos_weight_u= pos_weight + det=1e-10 + norm_u = norm + pos_weight_u= pos_weight - L_rec_u=0 + L_rec_u=0 - mu, logvar = self.encoder(x, adj) - hidden_dim2 = mu.shape[1] + mu, logvar = self.encoder(x, adj) + hidden_dim2 = mu.shape[1] - # z_mu, z_sigma2_log = self.encoder(x) - for l in range(L): + # z_mu, z_sigma2_log = self.encoder(x) + for l in range(L): - # z=torch.randn_like(z_mu)*torch.exp(z_sigma2_log/2)+z_mu - pred_adj = self.decoder(mu,logvar) - # L_rec+=F.binary_cross_entropy(x_pro,x) + # z=torch.randn_like(z_mu)*torch.exp(z_sigma2_log/2)+z_mu + pred_adj = self.decoder(mu,logvar) + # L_rec+=F.binary_cross_entropy(x_pro,x) - # cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels_sub_u,pos_weight = pos_weight) - cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels ,pos_weight = pos_weight) - # cost_a = norm_a * F.binary_cross_entropy_with_logits(pred_x, labels_sub_a, pos_weight = pos_weight_a) - # cost_a =torch.Tensor(1).fill_(0) + # cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels_sub_u,pos_weight = pos_weight) + cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels ,pos_weight = pos_weight) + # cost_a = norm_a * F.binary_cross_entropy_with_logits(pred_x, labels_sub_a, pos_weight = pos_weight_a) + # cost_a 
=torch.Tensor(1).fill_(0) - L_rec_u += cost_u - # L_rec_a += cost_a + L_rec_u += cost_u + # L_rec_a += cost_a - L_rec_u/=L - # L_rec_a/=L + L_rec_u/=L + # L_rec_a/=L - # z_a = self.reparameterize(mu_a,logvar_a) - # KLD_a = (0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) - # KLD_a =torch.Tensor(1).fill_(0) + # z_a = self.reparameterize(mu_a,logvar_a) + # KLD_a = (0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + # KLD_a =torch.Tensor(1).fill_(0) - # Loss=L_rec*x.size(1) + # Loss=L_rec*x.size(1) - self.pi_.data = (self.pi_/self.pi_.sum()).data - # log_sigma2_c=self.log_sigma2_c - # mu_c=self.mu_c + self.pi_.data = (self.pi_/self.pi_.sum()).data + # log_sigma2_c=self.log_sigma2_c + # mu_c=self.mu_c - # z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu - z = self.reparameterize(mu,logvar) + # z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) - gamma_c=torch.exp(torch.log(self.pi_.unsqueeze(0))+self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det - gamma_c = F.softmax(gamma_c) # is softmax a good way? + gamma_c=torch.exp(torch.log(self.pi_.unsqueeze(0))+self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + gamma_c = F.softmax(gamma_c) # is softmax a good way? - gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1)) #shape: batch_size*Clusters - self.pi_.data = gamma_c.mean(0).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1)) #shape: batch_size*Clusters + self.pi_.data = gamma_c.mean(0).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ - # KLD_u_c=(0.5 / n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(self.log_sigma2_c.unsqueeze(0)+\ - # torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+\ - # (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + # KLD_u_c=(0.5 / n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(self.log_sigma2_c.unsqueeze(0)+\ + # torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+\ + # (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) - # KLD_u_c-= (0.5/n_nodes)*torch.mean(torch.sum(1+2*logvar,1)) - # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - (0.5 / hidden_dim2)*torch.mean(torch.sum(1+2*logvar,1)) + # KLD_u_c-= (0.5/n_nodes)*torch.mean(torch.sum(1+2*logvar,1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - (0.5 / hidden_dim2)*torch.mean(torch.sum(1+2*logvar,1)) - KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1+self.log_sigma2_c.unsqueeze(0)-2*logvar.unsqueeze(1)+ - torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+ - (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1+self.log_sigma2_c.unsqueeze(0)-2*logvar.unsqueeze(1)+ + torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+ + (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) - gamma_loss = -(1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) + gamma_loss = 
-(1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - return L_rec_u,-KLD_u_c,-gamma_loss + return L_rec_u,-KLD_u_c,-gamma_loss - def pre_train(self,x,adj,Y,pre_epoch=50): - ''' - This function is used to initialize cluster paramters: pi_, mu_c, log_sigma2_c. - ------------- - paramters: - x: is the feature matrix of graph G. - adj: is the adjacent matrix of graph G. - Y: is the class label for each node in graph G. - ''' + def pre_train(self,x,adj,Y,pre_epoch=50): + ''' + This function is used to initialize cluster paramters: pi_, mu_c, log_sigma2_c. + ------------- + paramters: + x: is the feature matrix of graph G. + adj: is the adjacent matrix of graph G. + Y: is the class label for each node in graph G. + ''' - if not os.path.exists('./pretrain_model_{}.pk'.format(self.args.dataset)): + if not os.path.exists('./pretrain_model_{}.pk'.format(self.args.dataset)): - Loss=nn.MSELoss() - opti=Adam(self.parameters()) #all paramters in model + Loss=nn.MSELoss() + opti=Adam(self.parameters()) #all paramters in model - print('Pretraining......') - # epoch_bar=tqdm(range(pre_epoch)) - # for _ in epoch_bar: - for _ in range(pre_epoch): + print('Pretraining......') + # epoch_bar=tqdm(range(pre_epoch)) + # for _ in epoch_bar: + for _ in range(pre_epoch): - self.train() - L=0 - mu, logvar = self.encoder(x,adj) - pred_adj = self.decoder(mu,logvar) + self.train() + L=0 + mu, logvar = self.encoder(x,adj) + pred_adj = self.decoder(mu,logvar) - loss= Loss(pred_adj,adj.to_dense()) + loss= Loss(pred_adj,adj.to_dense()) - L+=loss.detach().cpu().numpy() + L+=loss.detach().cpu().numpy() - opti.zero_grad() - loss.backward() - opti.step() + opti.zero_grad() + loss.backward() + opti.step() - # epoch_bar.write('L2={:.4f}'.format(L)) - print('L2={:.4f}'.format(L)) + # epoch_bar.write('L2={:.4f}'.format(L)) + print('L2={:.4f}'.format(L)) - self.gc2.load_state_dict(self.gc3.state_dict()) - # self.linear_a2.load_state_dict(self.linear_a3.state_dict()) + self.gc2.load_state_dict(self.gc3.state_dict()) + # self.linear_a2.load_state_dict(self.linear_a3.state_dict()) - with torch.no_grad(): - mu, logvar = self.encoder(x,adj) - assert F.mse_loss(mu, logvar) == 0 - # assert F.mse_loss(mu_a, logvar_a) == 0 - Z = mu.data.numpy() + with torch.no_grad(): + mu, logvar = self.encoder(x,adj) + assert F.mse_loss(mu, logvar) == 0 + # assert F.mse_loss(mu_a, logvar_a) == 0 + Z = mu.data.numpy() - gmm = GaussianMixture(n_components=self.args.nClusters, covariance_type='diag') + gmm = GaussianMixture(n_components=self.args.nClusters, covariance_type='diag') - pre = gmm.fit_predict(Z) - print('Acc={:.4f}%'.format(cluster_acc(pre, Y)[0] * 100)) + pre = gmm.fit_predict(Z) + print('Acc={:.4f}%'.format(cluster_acc(pre, Y)[0] * 100)) - self.pi_.data = torch.from_numpy(gmm.weights_).float() - self.mu_c.data = torch.from_numpy(gmm.means_).float() - self.log_sigma2_c.data = torch.log(torch.from_numpy(gmm.covariances_).float()) + self.pi_.data = torch.from_numpy(gmm.weights_).float() + self.mu_c.data = torch.from_numpy(gmm.means_).float() + self.log_sigma2_c.data = torch.log(torch.from_numpy(gmm.covariances_).float()) - torch.save(self.state_dict(), './pretrain_model_{}.pk'.format(self.args.dataset)) - else: - self.load_state_dict(torch.load('./pretrain_model_{}.pk'.format(self.args.dataset))) + torch.save(self.state_dict(), './pretrain_model_{}.pk'.format(self.args.dataset)) + else: + self.load_state_dict(torch.load('./pretrain_model_{}.pk'.format(self.args.dataset))) - def predict(self,mu, 
logvar): - # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) - # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) - # z = torch.randn_like(mu) * torch.exp(logvar) + mu - z = self.reparameterize(mu,logvar) - pi = self.pi_ - log_sigma2_c = self.log_sigma2_c - mu_c = self.mu_c - gamma_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + def predict(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = torch.randn_like(mu) * torch.exp(logvar) + mu + z = self.reparameterize(mu,logvar) + pi = self.pi_ + log_sigma2_c = self.log_sigma2_c + mu_c = self.mu_c + gamma_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) - gamma=gamma_c.detach().cpu().numpy() + gamma=gamma_c.detach().cpu().numpy() - return np.argmax(gamma,axis=1),gamma + return np.argmax(gamma,axis=1),gamma - def gaussian_pdfs_log(self,x,mus,log_sigma2s): - G=[] - for c in range(self.args.nClusters): - G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1)) - return torch.cat(G,1) + def gaussian_pdfs_log(self,x,mus,log_sigma2s): + G=[] + for c in range(self.args.nClusters): + G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1)) + return torch.cat(G,1) - @staticmethod - def gaussian_pdf_log(x,mu,log_sigma2): - return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1)) + @staticmethod + def gaussian_pdf_log(x,mu,log_sigma2): + return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1)) - def check_parameters(self): - for name, param in self.named_parameters(): - if param.requires_grad: - print(name, param.data,param.data.shape) + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) class GCNModelVAECE(nn.Module): - def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): - super(GCNModelVAECE, self).__init__() + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAECE, self).__init__() + + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + # self.dc = InnerProductDecoder(dropout, act=lambda x: x) + self.dc = InnerDecoder(dropout, act=lambda x: x) + #for embedding attributes/features + self.linear_a1= Linear(n_nodes, hidden_dim1, act = torch.tanh,sparse_inputs=True) # the input dim is the number of nodes + self.linear_a2= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + self.linear_a3= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + + #modularity layer + # self.modulairty_layer = Linear(hidden_dim2,args.nClusters,act=torch.relu) + # self.cluster_choose= Linear(hidden_dim2,args.nClusters,act=torch.relu) + + + self.pi_=nn.Parameter(torch.FloatTensor(args.nClusters,).fill_(1)/args.nClusters,requires_grad=True) + self.mu_c=nn.Parameter(torch.FloatTensor(args.nClusters,hidden_dim2).fill_(0.00),requires_grad=True) + self.log_sigma2_c=nn.Parameter(torch.FloatTensor(args.nClusters,hidden_dim2).fill_(0.0),requires_grad=False) - self.args = args - self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) - self.gc2 = GraphConvolution(hidden_dim1, 
hidden_dim2, dropout, act=lambda x: x) - self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) - # self.dc = InnerProductDecoder(dropout, act=lambda x: x) - self.dc = InnerDecoder(dropout, act=lambda x: x) + torch.nn.init.xavier_normal_(self.mu_c) + # torch.nn.init.xavier_normal_(self.log_sigma2_c) - #for embedding attributes/features - self.linear_a1= Linear(n_nodes, hidden_dim1, act = torch.tanh,sparse_inputs=True) # the input dim is the number of nodes - self.linear_a2= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) - self.linear_a3= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + # calculate mi - #modularity layer - self.modulairty_layer = Linear(hidden_dim2,args.nClusters,act=torch.relu) + # critic_params = {'dim_x': x.shape[1],'dim_y':y.shape[1],'layers': 2,'embed_dim': 32,'hidden_dim': 64,'activation': 'relu',} + # self.critic_structure = ConcatCritic(hidden_dim2,n_nodes,256,3,'relu',rho=None,) + # self.critic_feature = ConcatCritic(hidden_dim2,input_feat_dim,256,3,'relu',rho=None,) + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + hidden_a1 = self.linear_a1(x.t()) # transpose the input feature matrix + return self.gc2(hidden1, adj), self.gc3(hidden1, adj), self.linear_a2(hidden_a1),self.linear_a3(hidden_a1) - self.pi_=nn.Parameter(torch.FloatTensor(args.nClusters,).fill_(1)/args.nClusters,requires_grad=True) - self.mu_c=nn.Parameter(torch.FloatTensor(args.nClusters,hidden_dim2).fill_(0.00),requires_grad=True) - self.log_sigma2_c=nn.Parameter(torch.FloatTensor(args.nClusters,hidden_dim2).fill_(0.0),requires_grad=False) - - torch.nn.init.xavier_normal_(self.mu_c) - # torch.nn.init.xavier_normal_(self.log_sigma2_c) + def decoder(self,mu,mu_a,logvar,logvar_a): - # calculate mi + z_u = self.reparameterize(mu, logvar) + z_a = self.reparameterize(mu_a,logvar_a) + return self.dc((z_u,z_a)) - # critic_params = {'dim_x': x.shape[1],'dim_y':y.shape[1],'layers': 2,'embed_dim': 32,'hidden_dim': 64,'activation': 'relu',} - # self.critic_structure = ConcatCritic(hidden_dim2,n_nodes,256,3,'relu',rho=None,) - # self.critic_feature = ConcatCritic(hidden_dim2,input_feat_dim,256,3,'relu',rho=None,) + def reparameterize(self, mu, logvar): + if self.training: + std = torch.exp(logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + else: + return mu - def encoder(self, x, adj): - hidden1 = self.gc1(x, adj) - hidden_a1 = self.linear_a1(x.t()) # transpose the input feature matrix - return self.gc2(hidden1, adj), self.gc3(hidden1, adj), self.linear_a2(hidden_a1),self.linear_a3(hidden_a1) + def forward(self, x, adj): - def decoder(self,mu,mu_a,logvar,logvar_a): + mu, logvar, mu_a, logvar_a = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + z_a = self.reparameterize(mu_a,logvar_a) + return self.dc((z_u,z_a)),mu, logvar, mu_a, logvar_a - z_u = self.reparameterize(mu, logvar) - z_a = self.reparameterize(mu_a,logvar_a) - return self.dc((z_u,z_a)) + def modularity_loss(self, z,adj): - def reparameterize(self, mu, logvar): - if self.training: - std = torch.exp(logvar) - eps = torch.randn_like(std) - return eps.mul(std).add_(mu) - else: - return mu + adj = adj.to_dense() + H = self.modulairty_layer(z) + assert H.shape[0]==z.shape[0] - def forward(self, x, adj): + n = torch.tensor(1.0*z.shape[0]) - mu, logvar, mu_a, logvar_a = self.encoder(x, adj) - z_u = self.reparameterize(mu, logvar) - z_a = self.reparameterize(mu_a,logvar_a) - return self.dc((z_u,z_a)),mu, logvar, mu_a, logvar_a + H_norm = n.sqrt()*H.sqrt()/(H.sqrt().sum()) + 
print("H_norm shape",H_norm.shape) + print("H_norm ",H_norm) + m = (adj-torch.eye(adj.shape[0])).sum()/2 + D = (adj-torch.eye(adj.shape[0])).sum(1) # the degree of nodes, adj includes self loop + B = (adj-torch.eye(adj.shape[0]))-torch.matmul(D.view(-1,1),D.view(1,-1))/(2*m) # modularity matrix + mod_loss=torch.trace(torch.matmul(torch.matmul(H_norm.t(),B),H_norm)/(4*m)) + print("mod_loss",mod_loss) - def modularity_loss(self, z,adj): + return mod_loss - adj = adj.to_dense() - H = self.modulairty_layer(z) - assert H.shape[0]==z.shape[0] + def dist(self,x): + # x = x/torch.norm(x,2,dim=1).view(-1,1) + assert len(x.size()) == 2 + norm = (x ** 2).sum(1).view(-1, 1) + dn = (norm + norm.view(1, -1)) - 2.0 * (x @ x.t()) + return torch.sum(torch.relu(dn).sqrt()) - n = torch.tensor(1.0*z.shape[0]) + def mi_loss(self,z,x,a): + # critic_params = {'dim_x': x.shape[1],'dim_y':y.shape[1],'layers': 2,'embed_dim': 32,'hidden_dim': 64,'activation': 'relu',} + # critic = ConcatCritic(rho=None,**critic_params) + indice = torch.randperm(len(z))[0:50] + # mi_x = estimate_mutual_information('dv',z[indice],x[indice],self.critic_structure) + mi_a = estimate_mutual_information('js',z[indice],a[indice],self.critic_feature) + return mi_a - H_norm = n.sqrt()*H.sqrt()/(H.sqrt().sum()) - print("H_norm shape",H_norm.shape) - print("H_norm ",H_norm) - m = (adj-torch.eye(adj.shape[0])).sum()/2 - D = (adj-torch.eye(adj.shape[0])).sum(1) # the degree of nodes, adj includes self loop - B = (adj-torch.eye(adj.shape[0]))-torch.matmul(D.view(-1,1),D.view(1,-1))/(2*m) # modularity matrix - mod_loss=torch.trace(torch.matmul(torch.matmul(H_norm.t(),B),H_norm)/(4*m)) - print("mod_loss",mod_loss) + def change_cluster_grad_false(self): + for name, param in self.named_parameters(): + if name in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=False - return mod_loss + def change_cluster_grad_true(self): + for name, param in self.named_parameters(): + if name in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=True - def dist(self,x): - # x = x/torch.norm(x,2,dim=1).view(-1,1) - assert len(x.size()) == 2 - norm = (x ** 2).sum(1).view(-1, 1) - dn = (norm + norm.view(1, -1)) - 2.0 * (x @ x.t()) - return torch.sum(torch.relu(dn).sqrt()) - def mi_loss(self,z,x,a): - # critic_params = {'dim_x': x.shape[1],'dim_y':y.shape[1],'layers': 2,'embed_dim': 32,'hidden_dim': 64,'activation': 'relu',} - # critic = ConcatCritic(rho=None,**critic_params) - indice = torch.randperm(len(z))[0:50] - # mi_x = estimate_mutual_information('dv',z[indice],x[indice],self.critic_structure) - mi_a = estimate_mutual_information('js',z[indice],a[indice],self.critic_feature) - return mi_a + def change_nn_grad_false(self): + for name, param in self.named_parameters(): + if name not in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=False - def change_cluster_grad_false(self): - for name, param in self.named_parameters(): - if name in ['pi_','mu_c','log_sigma2_c']: - param.requires_grad=False + def change_nn_grad_true(self): + for name, param in self.named_parameters(): + if name not in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=True - def change_cluster_grad_true(self): - for name, param in self.named_parameters(): - if name in ['pi_','mu_c','log_sigma2_c']: - param.requires_grad=True + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + det=1e-10 + labels_sub_u, labels_sub_a = labels + norm_u, norm_a = norm + pos_weight_u, pos_weight_a = pos_weight - def change_nn_grad_false(self): - for name, param in 
self.named_parameters(): - if name not in ['pi_','mu_c','log_sigma2_c']: - param.requires_grad=False + L_rec_u=0 + L_rec_a=0 - def change_nn_grad_true(self): - for name, param in self.named_parameters(): - if name not in ['pi_','mu_c','log_sigma2_c']: - param.requires_grad=True + mi=0 - def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + mu, logvar, mu_a, logvar_a = self.encoder(x, adj) - det=1e-10 - labels_sub_u, labels_sub_a = labels - norm_u, norm_a = norm - pos_weight_u, pos_weight_a = pos_weight + # mutual information loss - L_rec_u=0 - L_rec_a=0 + # z_mu, z_sigma2_log = self.encoder(x) + # mi_a = self.mi_loss(mu,adj.to_dense(),x.to_dense()) + for l in range(L): - mi=0 + # z=torch.randn_like(z_mu)*torch.exp(z_sigma2_log/2)+z_mu + pred_adj, pred_x = self.decoder(mu,mu_a,logvar,logvar_a) + # L_rec+=F.binary_cross_entropy(x_pro,x) - mu, logvar, mu_a, logvar_a = self.encoder(x, adj) + cost_u = norm_u * F.binary_cross_entropy_with_logits(pred_adj, labels_sub_u, pos_weight = pos_weight_u) + cost_a = norm_a * F.binary_cross_entropy_with_logits(pred_x, labels_sub_a, pos_weight = pos_weight_a) + # cost_a =torch.Tensor(1).fill_(0) - # mutual information loss + L_rec_u += cost_u + L_rec_a += cost_a - # z_mu, z_sigma2_log = self.encoder(x) - # mi_a = self.mi_loss(mu,adj.to_dense(),x.to_dense()) - for l in range(L): - # z=torch.randn_like(z_mu)*torch.exp(z_sigma2_log/2)+z_mu - pred_adj, pred_x = self.decoder(mu,mu_a,logvar,logvar_a) - # L_rec+=F.binary_cross_entropy(x_pro,x) + L_rec_u/=L + L_rec_a/=L - cost_u = norm_u * F.binary_cross_entropy_with_logits(pred_adj, labels_sub_u, pos_weight = pos_weight_u) - cost_a = norm_a * F.binary_cross_entropy_with_logits(pred_x, labels_sub_a, pos_weight = pos_weight_a) - # cost_a =torch.Tensor(1).fill_(0) + # z_a = self.reparameterize(mu_a,logvar_a) + # KLD_a = (0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + KLD_a = -(0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + # KLD_a =torch.Tensor(1).fill_(0) - L_rec_u += cost_u - L_rec_a += cost_a + # Loss=L_rec*x.size(1) - L_rec_u/=L - L_rec_a/=L + # log_sigma2_c=self.log_sigma2_c + # mu_c=self.mu_c - # z_a = self.reparameterize(mu_a,logvar_a) - # KLD_a = (0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) - KLD_a = -(0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) - # KLD_a =torch.Tensor(1).fill_(0) + # z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) - # Loss=L_rec*x.size(1) + # mod_loss=self.modularity_loss(z,adj) + # gamma_c=torch.exp(torch.log(self.pi_.unsqueeze(0))+self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + gamma_c=torch.exp(self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + # gamma_c = self.cluster_choose(self.reparameterize(mu,logvar)) + # print('gamma_c:',gamma_c) + gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1))#batch_size*Clusters + gamma_c=F.softmax(gamma_c) + # print('gamma_c normalized:',gamma_c) + # print('gamma_c argmax:',torch.argmax(gamma_c,1)) + print('gamma_c counter:',Counter(torch.argmax(gamma_c,1).tolist())) - # log_sigma2_c=self.log_sigma2_c - # mu_c=self.mu_c + # gamma_c=torch.nn.functional.one_hot(torch.argmax(gamma_c,1),self.args.nClusters) - # z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu - z = self.reparameterize(mu,logvar) + # self.pi_.data = 
(self.pi_/self.pi_.sum()).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + # self.pi_.data = gamma_c.mean(0).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ - # mod_loss=self.modularity_loss(z,adj) - # gamma_c=torch.exp(torch.log(self.pi_.unsqueeze(0))+self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det - gamma_c=torch.exp(self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det - # print('gamma_c:',gamma_c) + KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1+self.log_sigma2_c.unsqueeze(0)-2*logvar.unsqueeze(1)+torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+(mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + # KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1-2*logvar.unsqueeze(1)+torch.exp(2*logvar.unsqueeze(1))+(mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2),2),1)) + # temp_kld=-(0.5/n_nodes)*torch.sum((mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2),2) - gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1))#batch_size*Clusters - gamma_c=F.softmax(gamma_c) - # print('gamma_c normalized:',gamma_c) - # print('gamma_c argmax:',torch.argmax(gamma_c,1)) - print('gamma_c counter:',Counter(torch.argmax(gamma_c,1).tolist())) + # KLD_u_c_test=-(0.5/n_nodes)*F.mse_loss(mu.unsqueeze(1),self.mu_c.unsqueeze(0),reduction='none') + # print('kld_u_c_test:',KLD_u_c_test.sum(2)) - # self.pi_.data = (self.pi_/self.pi_.sum()).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ - # self.pi_.data = gamma_c.mean(0).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + # KLD_u_c=-(0.5/n_nodes)*F.mse_loss(mu.unsqueeze(1),self.mu_c.unsqueeze(0)) - KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1+self.log_sigma2_c.unsqueeze(0)-2*logvar.unsqueeze(1)+torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+(mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) - # KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1-2*logvar.unsqueeze(1)+torch.exp(2*logvar.unsqueeze(1))+(mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2),2),1)) - # temp_kld=-(0.5/n_nodes)*torch.sum((mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2),2) + # KLD_u_c=(0.5 / n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(self.log_sigma2_c.unsqueeze(0)+\ + # torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+\ + # (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) - # KLD_u_c_test=-(0.5/n_nodes)*F.mse_loss(mu.unsqueeze(1),self.mu_c.unsqueeze(0),reduction='none') - # print('kld_u_c_test:',KLD_u_c_test.sum(2)) + mutual_dist = (1/(self.args.nClusters**2))*self.dist(self.mu_c) + # gamma_loss=-(1/self.args.nClusters)*torch.mean(torch.sum(gamma_c*torch.log(gamma_c),1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c),1)) - (0.5 / self.args.hid_dim)*torch.mean(torch.sum(1+2*logvar,1)) + gamma_loss = -(1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - (0.5 / self.args.hid_dim)*torch.mean(torch.sum(1+2*logvar,1)) - # 
KLD_u_c=-(0.5/n_nodes)*F.mse_loss(mu.unsqueeze(1),self.mu_c.unsqueeze(0)) + #soft assignment + Q = self.getSoftAssignments(z,self.mu_c,self.args.nClusters,self.args.hidden2,n_nodes) + P = self.calculateP(Q) + soft_cluster_loss = self.getKLDivLossExpression(Q,P) - # KLD_u_c=(0.5 / n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(self.log_sigma2_c.unsqueeze(0)+\ - # torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+\ - # (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + # return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a + return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a , -gamma_loss, -mutual_dist,soft_cluster_loss + # return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a , -gamma_loss,-mi_a + # return L_rec_u + L_rec_a + KLD_u_c + KLD_a + gamma_loss - mutual_dist = (1/(self.args.nClusters**2))*self.dist(self.mu_c) - # gamma_loss=-(1/self.args.nClusters)*torch.mean(torch.sum(gamma_c*torch.log(gamma_c),1)) - # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c),1)) - (0.5 / self.args.hid_dim)*torch.mean(torch.sum(1+2*logvar,1)) - gamma_loss = -(1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - (0.5 / self.args.hid_dim)*torch.mean(torch.sum(1+2*logvar,1)) + def pre_train(self,x,adj,Y,pre_epoch=22): + ''' + This function is used to initialize cluster paramters: pi_, mu_c, log_sigma2_c. + ------------- + paramters: + x: is the feature matrix of graph G. + adj: is the adjacent matrix of graph G. + Y: is the class label for each node in graph G. + ''' + if not os.path.exists('./pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch)): - # return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a - return L_rec_u , 0.1*L_rec_a , -30*KLD_u_c ,-KLD_a , -gamma_loss, -0.05*mutual_dist - # return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a , -gamma_loss,-mi_a - # return L_rec_u + L_rec_a + KLD_u_c + KLD_a + gamma_loss + Loss=nn.MSELoss() + opti=Adam(self.parameters()) #all paramters in model + print('Pretraining......') + # epoch_bar=tqdm(range(pre_epoch)) + # for _ in epoch_bar: + for _ in range(pre_epoch): - def pre_train(self,x,adj,Y,pre_epoch=22): - ''' - This function is used to initialize cluster paramters: pi_, mu_c, log_sigma2_c. - ------------- - paramters: - x: is the feature matrix of graph G. - adj: is the adjacent matrix of graph G. - Y: is the class label for each node in graph G. 
- ''' + self.train() + L=0 + mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + pred_adj, pred_x = self.decoder(mu,mu_a,logvar,logvar_a) - if not os.path.exists('./pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch)): + loss= Loss(pred_x,x) + Loss(pred_adj,adj) - Loss=nn.MSELoss() - opti=Adam(self.parameters()) #all paramters in model + L+=loss.detach().cpu().numpy() - print('Pretraining......') - # epoch_bar=tqdm(range(pre_epoch)) - # for _ in epoch_bar: - for _ in range(pre_epoch): + opti.zero_grad() + loss.backward() + opti.step() - self.train() - L=0 - mu, logvar, mu_a, logvar_a = self.encoder(x,adj) - pred_adj, pred_x = self.decoder(mu,mu_a,logvar,logvar_a) + # epoch_bar.write('L2={:.4f}'.format(L)) + print('L2={:.4f}'.format(L)) - loss= Loss(pred_x,x) + Loss(pred_adj,adj) + # self.gc2.load_state_dict(self.gc3.state_dict()) + # self.linear_a2.load_state_dict(self.linear_a3.state_dict()) - L+=loss.detach().cpu().numpy() - opti.zero_grad() - loss.backward() - opti.step() + # with torch.no_grad(): + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # assert F.mse_loss(mu, logvar) == 0 + # assert F.mse_loss(mu_a, logvar_a) == 0 + # Z = mu.data.numpy() - # epoch_bar.write('L2={:.4f}'.format(L)) - print('L2={:.4f}'.format(L)) + mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + Z = self.reparameterize(mu,logvar) - # self.gc2.load_state_dict(self.gc3.state_dict()) - # self.linear_a2.load_state_dict(self.linear_a3.state_dict()) + gmm = GaussianMixture(n_components=self.args.nClusters, covariance_type='diag') + pre = gmm.fit_predict(Z.cpu().detach().numpy()) + print('Acc={:.4f}%'.format(cluster_acc(pre, Y)[0] * 100)) - # with torch.no_grad(): - # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) - # assert F.mse_loss(mu, logvar) == 0 - # assert F.mse_loss(mu_a, logvar_a) == 0 - # Z = mu.data.numpy() + self.pi_.data = torch.from_numpy(gmm.weights_).float() + self.mu_c.data = torch.from_numpy(gmm.means_).float() + self.log_sigma2_c.data = torch.log(torch.from_numpy(gmm.covariances_).float()) - mu, logvar, mu_a, logvar_a = self.encoder(x,adj) - Z = self.reparameterize(mu,logvar) + torch.save(self.state_dict(), './pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch)) + else: + self.load_state_dict(torch.load('./pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch))) - gmm = GaussianMixture(n_components=self.args.nClusters, covariance_type='diag') + # def predict_nn(self,mu,logvar): + # z = self.reparameterize(mu,logvar) + # gamma_c = self.cluster_choose(self.reparameterize(mu,logvar)) - pre = gmm.fit_predict(Z.cpu().detach().numpy()) - print('Acc={:.4f}%'.format(cluster_acc(pre, Y)[0] * 100)) + # print('gamma_c,normalized:',gamma_c) + # print('gamma_c argmax:',torch.argmax(gamma_c,1)) + # print('gamma_c argmax counter:',Counter(torch.argmax(gamma_c,1).tolist())) - self.pi_.data = torch.from_numpy(gmm.weights_).float() - self.mu_c.data = torch.from_numpy(gmm.means_).float() - self.log_sigma2_c.data = torch.log(torch.from_numpy(gmm.covariances_).float()) - - torch.save(self.state_dict(), './pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch)) - else: - self.load_state_dict(torch.load('./pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch))) + # gamma=gamma_c.detach().cpu().numpy() - def predict(self,mu, logvar): - # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) - # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) - # z = torch.randn_like(mu) * torch.exp(z_sigma2_log / 2) + z_mu - det=1e-10 - z = self.reparameterize(mu,logvar) - pi = self.pi_ 
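(Editor's aside, not part of the patch: the pre_train/predict logic in this hunk initialises the mixture parameters pi_, mu_c and log_sigma2_c from a fitted sklearn GaussianMixture and then assigns nodes to clusters via Gaussian log-densities. A minimal standalone sketch of that idea follows; the function names and shapes are illustrative assumptions, not identifiers from this repository.)

import numpy as np
import torch
from sklearn.mixture import GaussianMixture

def init_gmm_prior(z, n_clusters):
    # Fit a diagonal-covariance GMM on latent codes to initialise pi_, mu_c, log_sigma2_c.
    gmm = GaussianMixture(n_components=n_clusters, covariance_type='diag')
    labels = gmm.fit_predict(z.detach().cpu().numpy())
    pi = torch.from_numpy(gmm.weights_).float()                           # (K,)
    mu_c = torch.from_numpy(gmm.means_).float()                           # (K, D)
    log_sigma2_c = torch.log(torch.from_numpy(gmm.covariances_).float())  # (K, D)
    return labels, pi, mu_c, log_sigma2_c

def gaussian_log_pdf(z, mu_c, log_sigma2_c):
    # log N(z_i | mu_k, diag(sigma_k^2)) summed over latent dims; z: (N, D) -> result: (N, K).
    return -0.5 * torch.sum(
        np.log(2 * np.pi) + log_sigma2_c.unsqueeze(0)
        + (z.unsqueeze(1) - mu_c.unsqueeze(0)).pow(2) / torch.exp(log_sigma2_c.unsqueeze(0)),
        dim=2)

def responsibilities(z, pi, mu_c, log_sigma2_c, eps=1e-10):
    # gamma_ik proportional to pi_k * N(z_i | mu_k, sigma_k^2), normalised over clusters k.
    gamma = torch.exp(torch.log(pi.unsqueeze(0)) + gaussian_log_pdf(z, mu_c, log_sigma2_c)) + eps
    return gamma / gamma.sum(1, keepdim=True)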
- # log_sigma2_c = self.log_sigma2_c - # mu_c = self.mu_c - # gamma_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) - gamma_c = torch.exp(self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det - print('gamma_c:',gamma_c) - gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1))#batch_size*Clusters - gamma_c=F.softmax(gamma_c) - print('gamma_c,normalized:',gamma_c) - print('gamma_c argmax:',torch.argmax(gamma_c,1)) - print('gamma_c argmax counter:',Counter(torch.argmax(gamma_c,1).tolist())) - - gamma=gamma_c.detach().cpu().numpy() - return np.argmax(gamma,axis=1),gamma, z - - def predict_dist(self,mu, logvar): - # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) - # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) - # z = torch.randn_like(mu) * torch.exp(z_sigma2_log / 2) + z_mu - z = self.reparameterize(mu,logvar) - pi = self.pi_ - log_sigma2_c = self.log_sigma2_c - mu_c = self.mu_c - # gamma_c = torch.exp(self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) - - # gamma=gamma_c.detach().cpu().numpy() - - gamma=[] - for e in range(z.shape[0]): - temp_dist=[] - for m in range(mu_c.shape[0]): - temp_dist.append(F.mse_loss(z[e],mu_c[m]).data) - gamma.append(temp_dist) - - return np.argmin(gamma,axis=1),np.array(gamma) - - def plot_tsne(self,dataset,epoch,z,true_label,pred_label): - - tsne = TSNE(n_components=2, init='pca',perplexity=50.0) - data = torch.cat([z,self.mu_c.to('cpu')],dim=0).detach().numpy() - zs_tsne = tsne.fit_transform(data) - - cluster_labels=set(true_label) - print(cluster_labels) - index_group= [np.array(true_label)==y for y in cluster_labels] - colors = cm.tab20(range(len(index_group))) - - fig, ax = plt.subplots() - for index,c in zip(index_group,colors): - ax.scatter(zs_tsne[np.ix_(index), 0], zs_tsne[np.ix_(index), 1],color=c,s=2) - - ax.scatter(zs_tsne[z.shape[0]:, 0], zs_tsne[z.shape[0]:, 1],marker='^',color='b',s=40) - plt.title('true label') - # ax.legend() - plt.savefig("./visualization/{}_{}_tsne_{}.pdf".format(dataset,epoch,'true_label')) - - cluster_labels=set(pred_label) - print(cluster_labels) - index_group= [np.array(pred_label)==y for y in cluster_labels] - colors = cm.tab10(range(len(index_group))) - - fig, ax = plt.subplots() - for index,c in zip(index_group,colors): - ax.scatter(zs_tsne[np.ix_(index), 0], zs_tsne[np.ix_(index), 1],color=c,s=2) - - for index,c in enumerate(colors): - ax.scatter(zs_tsne[z.shape[0]+index:z.shape[0]+index+1, 0], zs_tsne[z.shape[0]+index:z.shape[0]+index+1, 1],marker='^',color=c,s=40) - - plt.title('pred label') - # ax.legend() - plt.savefig("./visualization/{}_{}_tsne_{}.pdf".format(dataset,epoch,'pred_label')) - - def gaussian_pdfs_log(self,x,mus,log_sigma2s): - G=[] - for c in range(self.args.nClusters): - G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1)) - return torch.cat(G,1) - - - @staticmethod - def gaussian_pdf_log(x,mu,log_sigma2): - return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1)) # np.pi*2, not square - - def check_parameters(self): - for name, param in self.named_parameters(): - if param.requires_grad: - print(name, param.data,param.data.shape) - def check_gradient(self): - for name, param in self.named_parameters(): - if param.requires_grad: - print('grad: ',name) - print(param.grad,param.grad.shape) + + # return np.argmax(gamma,axis=1),gamma, z + + + + def predict(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = 
torch.randn_like(mu) * torch.exp(z_sigma2_log / 2) + z_mu + det=1e-10 + z = self.reparameterize(mu,logvar) + pi = self.pi_ + # log_sigma2_c = self.log_sigma2_c + # mu_c = self.mu_c + # gamma_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + gamma_c = torch.exp(self.gaussian_pdfs_log(mu,self.mu_c,self.log_sigma2_c))+det + # gamma_c = torch.exp(self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + print('gamma_c:',gamma_c) + gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1))#batch_size*Clusters + # gamma_c=F.softmax(gamma_c) + print('gamma_c,normalized:',gamma_c) + print('gamma_c argmax:',torch.argmax(gamma_c,1)) + print('gamma_c argmax counter:',Counter(torch.argmax(gamma_c,1).tolist())) + + gamma=gamma_c.detach().cpu().numpy() + + return np.argmax(gamma,axis=1),gamma, z + + def predict_dist(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = torch.randn_like(mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) + pi = self.pi_ + log_sigma2_c = self.log_sigma2_c + mu_c = self.mu_c + # gamma_c = torch.exp(self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + + # gamma=gamma_c.detach().cpu().numpy() + + gamma=[] + for e in range(z.shape[0]): + temp_dist=[] + for m in range(mu_c.shape[0]): + temp_dist.append(F.mse_loss(z[e],mu_c[m]).data) + gamma.append(temp_dist) + + return np.argmin(gamma,axis=1),np.array(gamma) + + def plot_tsne(self,dataset,epoch,z,true_label,pred_label): + + tsne = TSNE(n_components=2, init='pca',perplexity=50.0) + data = torch.cat([z,self.mu_c.to('cpu')],dim=0).detach().numpy() + zs_tsne = tsne.fit_transform(data) + + cluster_labels=set(true_label) + print(cluster_labels) + index_group= [np.array(true_label)==y for y in cluster_labels] + colors = cm.tab20(range(len(index_group))) + + fig, ax = plt.subplots() + for index,c in zip(index_group,colors): + ax.scatter(zs_tsne[np.ix_(index), 0], zs_tsne[np.ix_(index), 1],color=c,s=2) + + ax.scatter(zs_tsne[z.shape[0]:, 0], zs_tsne[z.shape[0]:, 1],marker='^',color='b',s=40) + plt.title('true label') + # ax.legend() + plt.savefig("./visualization/{}_{}_tsne_{}.pdf".format(dataset,epoch,'true_label')) + + cluster_labels=set(pred_label) + print(cluster_labels) + index_group= [np.array(pred_label)==y for y in cluster_labels] + colors = cm.tab10(range(len(index_group))) + + fig, ax = plt.subplots() + for index,c in zip(index_group,colors): + ax.scatter(zs_tsne[np.ix_(index), 0], zs_tsne[np.ix_(index), 1],color=c,s=2) + + for index,c in enumerate(colors): + ax.scatter(zs_tsne[z.shape[0]+index:z.shape[0]+index+1, 0], zs_tsne[z.shape[0]+index:z.shape[0]+index+1, 1],marker='^',color=c,s=40) + + plt.title('pred label') + # ax.legend() + plt.savefig("./visualization/{}_{}_tsne_{}.pdf".format(dataset,epoch,'pred_label')) + + def gaussian_pdfs_log(self,x,mus,log_sigma2s): + G=[] + for c in range(self.args.nClusters): + G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1)) + return torch.cat(G,1) + + + @staticmethod + def gaussian_pdf_log(x,mu,log_sigma2): + return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1)) # np.pi*2, not square + + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + def check_gradient(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print('grad: ',name) + print(param.grad,param.grad.shape) + 
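(Editor's aside, not part of the patch: the calculateP/getKLDivLossExpression/getSoftAssignments methods added below implement a DEC-style clustering objective: a Student-t soft assignment Q between embeddings and cluster centres, a sharpened target distribution P, and a KL(P || Q) loss. A compact sketch under assumed shapes, for reference only.)

import torch

def soft_assignments(z, centers):
    # q_ij = (1 + ||z_i - mu_j||^2)^-1, normalised over clusters j; z: (N, D), centers: (K, D) -> (N, K).
    dist_sq = (z.unsqueeze(1) - centers.unsqueeze(0)).pow(2).sum(dim=2)
    q = 1.0 / (1.0 + dist_sq)
    return q / q.sum(dim=1, keepdim=True)

def target_distribution(q):
    # p_ij = (q_ij^2 / f_j) / sum_j'(q_ij'^2 / f_j'), where f_j = sum_i q_ij is the soft cluster frequency.
    weight = q.pow(2) / q.sum(dim=0)
    return weight / weight.sum(dim=1, keepdim=True)

def dec_loss(q, p):
    # KL(P || Q) = sum_ij p_ij * log(p_ij / q_ij).
    return torch.sum(p * torch.log(p / q))

# Usage sketch (names are assumptions): q = soft_assignments(z, model.mu_c);
# p = target_distribution(q).detach(); loss = dec_loss(q, p).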
+ def calculateP(self, Q): + # Function to calculate the desired distribution Q^2, for more details refer to DEC paper + f = Q.sum(dim=0) + pij_numerator = Q * Q + pij_numerator = pij_numerator / f + normalizer_p = pij_numerator.sum(dim=1).reshape((Q.shape[0], 1)) + P = pij_numerator / normalizer_p + return P + + def getKLDivLossExpression(self, Q_expression, P_expression): + # Loss = KL Divergence between the two distributions + log_arg = P_expression / Q_expression + log_exp = torch.log(log_arg) + sum_arg = P_expression * log_exp + loss = torch.sum(sum_arg) + return loss + + def getSoftAssignments(self,latent_space, cluster_centers, num_clusters, latent_space_dim, num_samples): + ''' + Returns cluster membership distribution for each sample + :param latent_space: latent space representation of inputs + :param cluster_centers: the coordinates of cluster centers in latent space + :param num_clusters: total number of clusters + :param latent_space_dim: dimensionality of latent space + :param num_samples: total number of input samples + :return: soft assigment based on the equation qij = (1+|zi - uj|^2)^(-1)/sum_j'((1+|zi - uj'|^2)^(-1)) + ''' + # z_expanded = latent_space.reshape((num_samples, 1, latent_space_dim)) + # z_expanded = T.tile(z_expanded, (1, num_clusters, 1)) + # u_expanded = T.tile(cluster_centers, (num_samples, 1, 1)) + + # distances_from_cluster_centers = (z_expanded - u_expanded).norm(2, axis=2) + # qij_numerator = 1 + distances_from_cluster_centers * distances_from_cluster_centers + # qij_numerator = 1 / qij_numerator + # normalizer_q = qij_numerator.sum(axis=1).reshape((num_samples, 1)) + + # return qij_numerator / normalizer_q + + + distances_from_cluster_centers = (latent_space.unsqueeze(1)- cluster_centers.unsqueeze(0)).norm(2, dim=2) + qij_numerator = 1 + distances_from_cluster_centers * distances_from_cluster_centers + qij_numerator = 1 / qij_numerator + normalizer_q = qij_numerator.sum(dim=1).reshape((num_samples, 1)) + + return qij_numerator / normalizer_q diff --git a/model_bk.py b/model_bk.py new file mode 100644 index 0000000..8f655de --- /dev/null +++ b/model_bk.py @@ -0,0 +1,677 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.optim import Adam +from sklearn.mixture import GaussianMixture +from sklearn.metrics import accuracy_score +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt +import matplotlib.cm as cm +import numpy as np +import os +from tqdm import tqdm + +from layers import GraphConvolution, GraphConvolutionSparse, Linear, InnerDecoder, InnerProductDecoder +from utils import cluster_acc + +from utils_smiles import * +from estimators import estimate_mutual_information +from collections import Counter + +class GCNModelAE(nn.Module): + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelAE, self).__init__() + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) + + def forward(self, x, adj): + z = self.gc1(x,adj) + z = self.gc2(z,adj) + return self.dc(z),z,None + + + def loss(self,pred_adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + + cost = norm * F.binary_cross_entropy_with_logits(pred_adj, labels,pos_weight = pos_weight) + return cost, + + def check_parameters(self): + for name, param in 
self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + +class GCNModelVAE(nn.Module): + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAE, self).__init__() + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) + + + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + return self.gc2(hidden1, adj), self.gc3(hidden1, adj) + + def decoder(self,mu,logvar): + + z_u = self.reparameterize(mu, logvar) + + return self.dc(z_u) + + def reparameterize(self, mu, logvar): + std = torch.exp(logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + + # if self.training: + # std = torch.exp(logvar) + # eps = torch.randn_like(std) + # return eps.mul(std).add_(mu) + # else: + # return mu + + def forward(self, x, adj): + + mu, logvar = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) + return self.dc(z_u),mu, logvar + + + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + + det=1e-10 + norm_u = norm + pos_weight_u= pos_weight + + L_rec_u=0 + + mu, logvar = self.encoder(x, adj) + # z_mu, z_sigma2_log = self.encoder(x) + for l in range(L): + + pred_adj = self.decoder(mu,logvar) + + cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels ,pos_weight = pos_weight) + + L_rec_u += cost_u + + L_rec_u/=L + + KLD = -0.5 / n_nodes * torch.mean(torch.sum(1 + 2 * logvar - mu.pow(2) - logvar.exp().pow(2),1)) + return L_rec_u, KLD + + + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + + +class GCNModelVAECD(nn.Module): + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAECD, self).__init__() + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) + + #for embedding attributes/features + # self.linear_a1= Linear(n_nodes, hidden_dim1, act = torch.tanh,sparse_inputs=True) # the input dim is the number of nodes + # self.linear_a2= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + # self.linear_a3= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + + + self.pi_=nn.Parameter(torch.FloatTensor(args.nClusters,).fill_(1)/args.nClusters,requires_grad=True) + self.mu_c=nn.Parameter(torch.randn(args.nClusters,hidden_dim2),requires_grad=True) + self.log_sigma2_c=nn.Parameter(torch.randn(args.nClusters,hidden_dim2),requires_grad=True) + + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + # hidden_a1 = self.linear_a1(x.t()) # transpose the input feature matrix + + return self.gc2(hidden1, adj), self.gc3(hidden1, adj) + + def decoder(self,mu,logvar): + + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) + + return self.dc(z_u) + + def reparameterize(self, mu, logvar): + if self.training: + std = 
torch.exp(logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + else: + return mu + + def forward(self, x, adj): + + mu, logvar = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) + return self.dc(z_u),mu, logvar + + + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + + det=1e-10 + norm_u = norm + pos_weight_u= pos_weight + + L_rec_u=0 + + mu, logvar = self.encoder(x, adj) + hidden_dim2 = mu.shape[1] + + # z_mu, z_sigma2_log = self.encoder(x) + for l in range(L): + + # z=torch.randn_like(z_mu)*torch.exp(z_sigma2_log/2)+z_mu + pred_adj = self.decoder(mu,logvar) + # L_rec+=F.binary_cross_entropy(x_pro,x) + + # cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels_sub_u,pos_weight = pos_weight) + cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels ,pos_weight = pos_weight) + # cost_a = norm_a * F.binary_cross_entropy_with_logits(pred_x, labels_sub_a, pos_weight = pos_weight_a) + # cost_a =torch.Tensor(1).fill_(0) + + L_rec_u += cost_u + # L_rec_a += cost_a + + L_rec_u/=L + # L_rec_a/=L + + # z_a = self.reparameterize(mu_a,logvar_a) + # KLD_a = (0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + # KLD_a =torch.Tensor(1).fill_(0) + + # Loss=L_rec*x.size(1) + + + self.pi_.data = (self.pi_/self.pi_.sum()).data + # log_sigma2_c=self.log_sigma2_c + # mu_c=self.mu_c + + # z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) + + gamma_c=torch.exp(torch.log(self.pi_.unsqueeze(0))+self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + gamma_c = F.softmax(gamma_c) # is softmax a good way? + + gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1)) #shape: batch_size*Clusters + self.pi_.data = gamma_c.mean(0).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + + # KLD_u_c=(0.5 / n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(self.log_sigma2_c.unsqueeze(0)+\ + # torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+\ + # (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + + # KLD_u_c-= (0.5/n_nodes)*torch.mean(torch.sum(1+2*logvar,1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - (0.5 / hidden_dim2)*torch.mean(torch.sum(1+2*logvar,1)) + + KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1+self.log_sigma2_c.unsqueeze(0)-2*logvar.unsqueeze(1)+ + torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+ + (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + + gamma_loss = -(1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) + + return L_rec_u,-KLD_u_c,-gamma_loss + + def pre_train(self,x,adj,Y,pre_epoch=10): + ''' + This function is used to initialize cluster paramters: pi_, mu_c, log_sigma2_c. + ------------- + paramters: + x: is the feature matrix of graph G. + adj: is the adjacent matrix of graph G. + Y: is the class label for each node in graph G. 
+ ''' + + if not os.path.exists('./pretrain_model_{}.pk'.format(self.args.dataset)): + + Loss=nn.MSELoss() + opti=Adam(self.parameters()) #all paramters in model + + print('Pretraining......') + # epoch_bar=tqdm(range(pre_epoch)) + # for _ in epoch_bar: + for _ in range(pre_epoch): + + self.train() + L=0 + mu, logvar = self.encoder(x,adj) + pred_adj = self.decoder(mu,logvar) + + loss= Loss(pred_adj,adj.to_dense()) + + L+=loss.detach().cpu().numpy() + + opti.zero_grad() + loss.backward() + opti.step() + + # epoch_bar.write('L2={:.4f}'.format(L)) + print('L2={:.4f}'.format(L)) + + self.gc2.load_state_dict(self.gc3.state_dict()) + # self.linear_a2.load_state_dict(self.linear_a3.state_dict()) + + with torch.no_grad(): + mu, logvar = self.encoder(x,adj) + assert F.mse_loss(mu, logvar) == 0 + # assert F.mse_loss(mu_a, logvar_a) == 0 + Z = mu.data.numpy() + + + gmm = GaussianMixture(n_components=self.args.nClusters, covariance_type='diag') + + pre = gmm.fit_predict(Z) + print('Acc={:.4f}%'.format(cluster_acc(pre, Y)[0] * 100)) + + self.pi_.data = torch.from_numpy(gmm.weights_).float() + self.mu_c.data = torch.from_numpy(gmm.means_).float() + self.log_sigma2_c.data = torch.log(torch.from_numpy(gmm.covariances_).float()) + + torch.save(self.state_dict(), './pretrain_model_{}.pk'.format(self.args.dataset)) + else: + self.load_state_dict(torch.load('./pretrain_model_{}.pk'.format(self.args.dataset))) + + def predict(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = torch.randn_like(mu) * torch.exp(logvar) + mu + z = self.reparameterize(mu,logvar) + pi = self.pi_ + log_sigma2_c = self.log_sigma2_c + mu_c = self.mu_c + gamma_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + + gamma=gamma_c.detach().cpu().numpy() + + return np.argmax(gamma,axis=1),gamma + + + def gaussian_pdfs_log(self,x,mus,log_sigma2s): + G=[] + for c in range(self.args.nClusters): + G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1)) + return torch.cat(G,1) + + + @staticmethod + def gaussian_pdf_log(x,mu,log_sigma2): + return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1)) + + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + +class GCNModelVAECE(nn.Module): + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAECE, self).__init__() + + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + # self.dc = InnerProductDecoder(dropout, act=lambda x: x) + self.dc = InnerDecoder(dropout, act=lambda x: x) + + #for embedding attributes/features + self.linear_a1= Linear(n_nodes, hidden_dim1, act = torch.tanh,sparse_inputs=True) # the input dim is the number of nodes + self.linear_a2= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + self.linear_a3= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + + #modularity layer + self.modulairty_layer = Linear(hidden_dim2,args.nClusters,act=torch.relu) + # cluster choosing + self.cluster_choose= Linear(hidden_dim2,args.nClusters,act=torch.sigmoid) + + + 
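+        # The three parameters that follow define a learnable Gaussian-mixture model over the
+        # node embedding z (used as the clustering prior in the KL term of the loss):
+        #   p(z) = sum_k pi_k * N(z; mu_k, diag(sigma_k^2)),
+        # where pi_ holds the K mixture weights, mu_c the K component means and
+        # log_sigma2_c the K component log-variances (one row per cluster, K = args.nClusters).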
self.pi_=nn.Parameter(torch.FloatTensor(args.nClusters,).fill_(1)/args.nClusters,requires_grad=True) + self.mu_c=nn.Parameter(torch.FloatTensor(args.nClusters,hidden_dim2).fill_(0),requires_grad=True) + self.log_sigma2_c=nn.Parameter(torch.FloatTensor(args.nClusters,hidden_dim2).fill_(1),requires_grad=True) + + # torch.nn.init.xavier_normal_(self.mu_c) + # torch.nn.init.xavier_normal_(self.log_sigma2_c) + + # calculate mi + + # critic_params = {'dim_x': x.shape[1],'dim_y':y.shape[1],'layers': 2,'embed_dim': 32,'hidden_dim': 64,'activation': 'relu',} + # self.critic_structure = ConcatCritic(hidden_dim2,n_nodes,256,3,'relu',rho=None,) + # self.critic_feature = ConcatCritic(hidden_dim2,input_feat_dim,256,3,'relu',rho=None,) + + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + hidden_a1 = self.linear_a1(x.t()) # transpose the input feature matrix + return self.gc2(hidden1, adj), self.gc3(hidden1, adj), self.linear_a2(hidden_a1),self.linear_a3(hidden_a1) + + def decoder(self,mu,mu_a,logvar,logvar_a): + + z_u = self.reparameterize(mu, logvar) + z_a = self.reparameterize(mu_a,logvar_a) + return self.dc((z_u,z_a)) + + def reparameterize(self, mu, logvar): + if self.training: + std = torch.exp(logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + else: + return mu + + def forward(self, x, adj): + + mu, logvar, mu_a, logvar_a = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + z_a = self.reparameterize(mu_a,logvar_a) + return self.dc((z_u,z_a)),mu, logvar, mu_a, logvar_a + + def modularity_loss(self, z,adj): + + adj = adj.to_dense() + H = self.modulairty_layer(z) + assert H.shape[0]==z.shape[0] + + n = torch.tensor(1.0*z.shape[0]) + + H_norm = n.sqrt()*H.sqrt()/(H.sqrt().sum()) + print("H_norm shape",H_norm.shape) + print("H_norm ",H_norm) + m = (adj-torch.eye(adj.shape[0])).sum()/2 + D = (adj-torch.eye(adj.shape[0])).sum(1) # the degree of nodes, adj includes self loop + B = (adj-torch.eye(adj.shape[0]))-torch.matmul(D.view(-1,1),D.view(1,-1))/(2*m) # modularity matrix + mod_loss=torch.trace(torch.matmul(torch.matmul(H_norm.t(),B),H_norm)/(4*m)) + print("mod_loss",mod_loss) + + return mod_loss + + def dist(self,x): + # x = x/torch.norm(x,2,dim=1).view(-1,1) + assert len(x.size()) == 2 + norm = (x ** 2).sum(1).view(-1, 1) + dn = (norm + norm.view(1, -1)) - 2.0 * (x @ x.t()) + return torch.sum(torch.relu(dn).sqrt()) + + def mi_loss(self,z,x,a): + # critic_params = {'dim_x': x.shape[1],'dim_y':y.shape[1],'layers': 2,'embed_dim': 32,'hidden_dim': 64,'activation': 'relu',} + # critic = ConcatCritic(rho=None,**critic_params) + indice = torch.randperm(len(z))[0:50] + # mi_x = estimate_mutual_information('dv',z[indice],x[indice],self.critic_structure) + mi_a = estimate_mutual_information('js',z[indice],a[indice],self.critic_feature) + return mi_a + + def change_cluster_grad_false(self): + for name, param in self.named_parameters(): + if name in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=False + + def change_cluster_grad_true(self): + for name, param in self.named_parameters(): + if name in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=True + + + def change_nn_grad_false(self): + for name, param in self.named_parameters(): + if name not in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=False + + def change_nn_grad_true(self): + for name, param in self.named_parameters(): + if name not in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=True + + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + + det=1e-10 + 
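+        # det is a small constant kept for numerical stability. The terms assembled below are
+        # returned separately and summed by the caller:
+        #   L_rec_u             weighted BCE reconstruction of the adjacency matrix
+        #   L_rec_a             weighted BCE reconstruction of the attribute matrix
+        #   -KLD_u_c            KL between q(z|X,A) and the mixture components, weighted by gamma_c
+        #   -KLD_a              KL between the attribute-embedding posterior and N(0, I)
+        #   -gamma_loss         regularizer on the soft cluster assignments gamma_c
+        #   -0.05*mutual_dist   pushes the cluster centres mu_c apart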
labels_sub_u, labels_sub_a = labels + norm_u, norm_a = norm + pos_weight_u, pos_weight_a = pos_weight + + L_rec_u=0 + L_rec_a=0 + + mi=0 + + mu, logvar, mu_a, logvar_a = self.encoder(x, adj) + + # mutual information loss + + # z_mu, z_sigma2_log = self.encoder(x) + # mi_a = self.mi_loss(mu,adj.to_dense(),x.to_dense()) + for l in range(L): + + # z=torch.randn_like(z_mu)*torch.exp(z_sigma2_log/2)+z_mu + pred_adj, pred_x = self.decoder(mu,mu_a,logvar,logvar_a) + # L_rec+=F.binary_cross_entropy(x_pro,x) + + cost_u = norm_u * F.binary_cross_entropy_with_logits(pred_adj, labels_sub_u, pos_weight = pos_weight_u) + cost_a = norm_a * F.binary_cross_entropy_with_logits(pred_x, labels_sub_a, pos_weight = pos_weight_a) + # cost_a =torch.Tensor(1).fill_(0) + + L_rec_u += cost_u + L_rec_a += cost_a + + + L_rec_u/=L + L_rec_a/=L + + # z_a = self.reparameterize(mu_a,logvar_a) + # KLD_a = (0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + KLD_a = -(0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + # KLD_a =torch.Tensor(1).fill_(0) + + # Loss=L_rec*x.size(1) + + + # log_sigma2_c=self.log_sigma2_c + # mu_c=self.mu_c + + # z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) + + # mod_loss=self.modularity_loss(z,adj) + # gamma_c=torch.exp(torch.log(self.pi_.unsqueeze(0))+self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + # gamma_c=torch.exp(self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + gamma_c = self.cluster_choose(z) + # print('gamma_c:',gamma_c) + + gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1))#batch_size*Clusters + gamma_c=F.softmax(gamma_c) + # print('gamma_c normalized:',gamma_c) + # print('gamma_c argmax:',torch.argmax(gamma_c,1)) + print('gamma_c counter:',Counter(torch.argmax(gamma_c,1).tolist())) + + + # self.pi_.data = (self.pi_/self.pi_.sum()).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + # self.pi_.data = gamma_c.mean(0).data # prior need to be re-normalized? 
In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + + KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1+self.log_sigma2_c.unsqueeze(0)-2*logvar.unsqueeze(1)+torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+(mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + + # KLD_u_c_test=-(0.5/n_nodes)*F.mse_loss(mu.unsqueeze(1),self.mu_c.unsqueeze(0),reduction='none') + # print('kld_u_c_test:',KLD_u_c_test.sum(2)) + + + # KLD_u_c=-(0.5/n_nodes)*F.mse_loss(mu.unsqueeze(1),self.mu_c.unsqueeze(0)) + + # KLD_u_c=(0.5 / n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(self.log_sigma2_c.unsqueeze(0)+\ + # torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+\ + # (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + + mutual_dist = (1/(self.args.nClusters**2))*self.dist(self.mu_c) + + gamma_loss=-(1/self.args.nClusters)*torch.mean(torch.sum(gamma_c*torch.log(gamma_c),1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c),1)) - (0.5 / self.args.hid_dim)*torch.mean(torch.sum(1+2*logvar,1)) + # gamma_loss = -(1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - (0.5 / self.args.hid_dim)*torch.mean(torch.sum(1+2*logvar,1)) + + + # return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a + return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a , -gamma_loss, -0.05*mutual_dist + # return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a , -gamma_loss,-mi_a + # return L_rec_u + L_rec_a + KLD_u_c + KLD_a + gamma_loss + + + def pre_train(self,x,adj,Y,pre_epoch=20): + ''' + This function is used to initialize cluster paramters: pi_, mu_c, log_sigma2_c. + ------------- + paramters: + x: is the feature matrix of graph G. + adj: is the adjacent matrix of graph G. + Y: is the class label for each node in graph G. 
+ ''' + + if not os.path.exists('./pretrain_model_{}.pk'.format(self.args.dataset)): + + Loss=nn.MSELoss() + opti=Adam(self.parameters()) #all paramters in model + + print('Pretraining......') + # epoch_bar=tqdm(range(pre_epoch)) + # for _ in epoch_bar: + for _ in range(pre_epoch): + + self.train() + L=0 + mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + pred_adj, pred_x = self.decoder(mu,mu_a,logvar,logvar_a) + + loss= Loss(pred_x,x.to_dense()) + Loss(pred_adj,adj.to_dense()) + + L+=loss.detach().cpu().numpy() + + opti.zero_grad() + loss.backward() + opti.step() + + # epoch_bar.write('L2={:.4f}'.format(L)) + print('L2={:.4f}'.format(L)) + + self.gc2.load_state_dict(self.gc3.state_dict()) + self.linear_a2.load_state_dict(self.linear_a3.state_dict()) + + + with torch.no_grad(): + mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + assert F.mse_loss(mu, logvar) == 0 + assert F.mse_loss(mu_a, logvar_a) == 0 + Z = mu.data.numpy() + + + gmm = GaussianMixture(n_components=self.args.nClusters, covariance_type='diag') + + pre = gmm.fit_predict(Z) + print('Acc={:.4f}%'.format(cluster_acc(pre, Y)[0] * 100)) + + self.pi_.data = torch.from_numpy(gmm.weights_).float() + self.mu_c.data = torch.from_numpy(gmm.means_).float() + self.log_sigma2_c.data = torch.log(torch.from_numpy(gmm.covariances_).float()) + + torch.save(self.state_dict(), './pretrain_model_{}.pk'.format(self.args.dataset)) + else: + self.load_state_dict(torch.load('./pretrain_model_{}.pk'.format(self.args.dataset))) + + def predict(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = torch.randn_like(mu) * torch.exp(z_sigma2_log / 2) + z_mu + det=1e-10 + z = self.reparameterize(mu,logvar) + pi = self.pi_ + # log_sigma2_c = self.log_sigma2_c + # mu_c = self.mu_c + # gamma_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + gamma_c = torch.exp(self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + print('gamma_c:',gamma_c) + gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1))#batch_size*Clusters + gamma_c=F.softmax(gamma_c) + print('gamma_c,normalized:',gamma_c) + print('gamma_c argmax:',torch.argmax(gamma_c,1)) + print('gamma_c argmax counter:',Counter(torch.argmax(gamma_c,1).tolist())) + + gamma=gamma_c.detach().cpu().numpy() + return np.argmax(gamma,axis=1),gamma, z + + def predict_dist(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = torch.randn_like(mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) + pi = self.pi_ + log_sigma2_c = self.log_sigma2_c + mu_c = self.mu_c + # gamma_c = torch.exp(self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + + # gamma=gamma_c.detach().cpu().numpy() + + gamma=[] + for e in range(z.shape[0]): + temp_dist=[] + for m in range(mu_c.shape[0]): + temp_dist.append(F.mse_loss(z[e],mu_c[m]).data) + gamma.append(temp_dist) + + return np.argmin(gamma,axis=1),np.array(gamma) + + def plot_tsne(self,dataset,epoch,z,true_label,desp): + + cluster_labels=set(true_label) + print(cluster_labels) + index_group= [np.array(true_label)==y for y in cluster_labels] + colors = cm.tab20(range(len(index_group))) + + tsne = TSNE(n_components=2, init='pca',perplexity=50.0) + data = torch.cat([z,self.mu_c.to('cpu')],dim=0).detach().numpy() + zs_tsne = tsne.fit_transform(data) + + fig, ax = plt.subplots() + cmap = plt.get_cmap("tab10") + for index,c in zip(index_group,colors): + 
ax.scatter(zs_tsne[np.ix_(index), 0], zs_tsne[np.ix_(index), 1],color=c,s=2) + + if 'predict' in desp.split(): + for index,c in enumerate(colors): + ax.scatter(zs_tsne[z.shape[0]+index:z.shape[0]+index+1, 0], zs_tsne[z.shape[0]+index:z.shape[0]+index+1, 1],marker='^',color=c,s=40) + else: + ax.scatter(zs_tsne[z.shape[0]:, 0], zs_tsne[z.shape[0]:, 1],marker='^',color='b',s=40) + plt.title(desp) + # ax.legend() + plt.savefig("{}_{}_tsne_{}.pdf".format(dataset,epoch,desp)) + + def gaussian_pdfs_log(self,x,mus,log_sigma2s): + G=[] + for c in range(self.args.nClusters): + G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1)) + return torch.cat(G,1) + + + @staticmethod + def gaussian_pdf_log(x,mu,log_sigma2): + return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1)) # np.pi*2, not square + + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + def check_gradient(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print('grad: ',name) + print(param.grad,param.grad.shape) diff --git a/model_cluster_choose.py b/model_cluster_choose.py new file mode 100644 index 0000000..c6367b6 --- /dev/null +++ b/model_cluster_choose.py @@ -0,0 +1,705 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.optim import Adam +from sklearn.mixture import GaussianMixture +from sklearn.metrics import accuracy_score +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt +import matplotlib.cm as cm +import numpy as np +import os +from tqdm import tqdm + +from layers import GraphConvolution, GraphConvolutionSparse, Linear, InnerDecoder, InnerProductDecoder +from utils import cluster_acc + +from utils_smiles import * +from estimators import estimate_mutual_information +from collections import Counter + +class GCNModelAE(nn.Module): + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelAE, self).__init__() + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) + + def forward(self, x, adj): + z = self.gc1(x,adj) + z = self.gc2(z,adj) + return self.dc(z),z,None + + + def loss(self,pred_adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + + cost = norm * F.binary_cross_entropy_with_logits(pred_adj, labels,pos_weight = pos_weight) + return cost, + + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + +class GCNModelVAE(nn.Module): + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAE, self).__init__() + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) + + + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + return self.gc2(hidden1, adj), self.gc3(hidden1, adj) + + def decoder(self,mu,logvar): + + z_u = self.reparameterize(mu, 
logvar) + + return self.dc(z_u) + + def reparameterize(self, mu, logvar): + std = torch.exp(logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + + # if self.training: + # std = torch.exp(logvar) + # eps = torch.randn_like(std) + # return eps.mul(std).add_(mu) + # else: + # return mu + + def forward(self, x, adj): + + mu, logvar = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) + return self.dc(z_u),mu, logvar + + + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + + det=1e-10 + norm_u = norm + pos_weight_u= pos_weight + + L_rec_u=0 + + mu, logvar = self.encoder(x, adj) + # z_mu, z_sigma2_log = self.encoder(x) + for l in range(L): + + pred_adj = self.decoder(mu,logvar) + + cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels ,pos_weight = pos_weight) + + L_rec_u += cost_u + + L_rec_u/=L + + KLD = -0.5 / n_nodes * torch.mean(torch.sum(1 + 2 * logvar - mu.pow(2) - logvar.exp().pow(2),1)) + return L_rec_u, KLD + + + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + + +class GCNModelVAECD(nn.Module): + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAECD, self).__init__() + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.dc = InnerProductDecoder(dropout, act=lambda x: x) + # self.dc = InnerDecoder(dropout, act=lambda x: x) + + #for embedding attributes/features + # self.linear_a1= Linear(n_nodes, hidden_dim1, act = torch.tanh,sparse_inputs=True) # the input dim is the number of nodes + # self.linear_a2= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + # self.linear_a3= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + + + self.pi_=nn.Parameter(torch.FloatTensor(args.nClusters,).fill_(1)/args.nClusters,requires_grad=True) + self.mu_c=nn.Parameter(torch.randn(args.nClusters,hidden_dim2),requires_grad=True) + self.log_sigma2_c=nn.Parameter(torch.randn(args.nClusters,hidden_dim2),requires_grad=True) + + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + # hidden_a1 = self.linear_a1(x.t()) # transpose the input feature matrix + + return self.gc2(hidden1, adj), self.gc3(hidden1, adj) + + def decoder(self,mu,logvar): + + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) + + return self.dc(z_u) + + def reparameterize(self, mu, logvar): + if self.training: + std = torch.exp(logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + else: + return mu + + def forward(self, x, adj): + + mu, logvar = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + # z_a = self.reparameterize(mu_a,logvar_a) + return self.dc(z_u),mu, logvar + + + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + + det=1e-10 + norm_u = norm + pos_weight_u= pos_weight + + L_rec_u=0 + + mu, logvar = self.encoder(x, adj) + hidden_dim2 = mu.shape[1] + + # z_mu, z_sigma2_log = self.encoder(x) + for l in range(L): + + # z=torch.randn_like(z_mu)*torch.exp(z_sigma2_log/2)+z_mu + pred_adj = self.decoder(mu,logvar) + # L_rec+=F.binary_cross_entropy(x_pro,x) + + # cost_u = norm * F.binary_cross_entropy_with_logits(pred_adj, labels_sub_u,pos_weight = pos_weight) + cost_u = 
norm * F.binary_cross_entropy_with_logits(pred_adj, labels ,pos_weight = pos_weight) + # cost_a = norm_a * F.binary_cross_entropy_with_logits(pred_x, labels_sub_a, pos_weight = pos_weight_a) + # cost_a =torch.Tensor(1).fill_(0) + + L_rec_u += cost_u + # L_rec_a += cost_a + + L_rec_u/=L + # L_rec_a/=L + + # z_a = self.reparameterize(mu_a,logvar_a) + # KLD_a = (0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + # KLD_a =torch.Tensor(1).fill_(0) + + # Loss=L_rec*x.size(1) + + + self.pi_.data = (self.pi_/self.pi_.sum()).data + # log_sigma2_c=self.log_sigma2_c + # mu_c=self.mu_c + + # z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) + + gamma_c=torch.exp(torch.log(self.pi_.unsqueeze(0))+self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + gamma_c = F.softmax(gamma_c) # is softmax a good way? + + gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1)) #shape: batch_size*Clusters + self.pi_.data = gamma_c.mean(0).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + + # KLD_u_c=(0.5 / n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(self.log_sigma2_c.unsqueeze(0)+\ + # torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+\ + # (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + + # KLD_u_c-= (0.5/n_nodes)*torch.mean(torch.sum(1+2*logvar,1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - (0.5 / hidden_dim2)*torch.mean(torch.sum(1+2*logvar,1)) + + KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1+self.log_sigma2_c.unsqueeze(0)-2*logvar.unsqueeze(1)+ + torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+ + (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + + gamma_loss = -(1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) + + return L_rec_u,-KLD_u_c,-gamma_loss + + def pre_train(self,x,adj,Y,pre_epoch=50): + ''' + This function is used to initialize cluster paramters: pi_, mu_c, log_sigma2_c. + ------------- + paramters: + x: is the feature matrix of graph G. + adj: is the adjacent matrix of graph G. + Y: is the class label for each node in graph G. 
+ ''' + + if not os.path.exists('./pretrain_model_{}.pk'.format(self.args.dataset)): + + Loss=nn.MSELoss() + opti=Adam(self.parameters()) #all paramters in model + + print('Pretraining......') + # epoch_bar=tqdm(range(pre_epoch)) + # for _ in epoch_bar: + for _ in range(pre_epoch): + + self.train() + L=0 + mu, logvar = self.encoder(x,adj) + pred_adj = self.decoder(mu,logvar) + + loss= Loss(pred_adj,adj.to_dense()) + + L+=loss.detach().cpu().numpy() + + opti.zero_grad() + loss.backward() + opti.step() + + # epoch_bar.write('L2={:.4f}'.format(L)) + print('L2={:.4f}'.format(L)) + + self.gc2.load_state_dict(self.gc3.state_dict()) + # self.linear_a2.load_state_dict(self.linear_a3.state_dict()) + + with torch.no_grad(): + mu, logvar = self.encoder(x,adj) + assert F.mse_loss(mu, logvar) == 0 + # assert F.mse_loss(mu_a, logvar_a) == 0 + Z = mu.data.numpy() + + + gmm = GaussianMixture(n_components=self.args.nClusters, covariance_type='diag') + + pre = gmm.fit_predict(Z) + print('Acc={:.4f}%'.format(cluster_acc(pre, Y)[0] * 100)) + + self.pi_.data = torch.from_numpy(gmm.weights_).float() + self.mu_c.data = torch.from_numpy(gmm.means_).float() + self.log_sigma2_c.data = torch.log(torch.from_numpy(gmm.covariances_).float()) + + torch.save(self.state_dict(), './pretrain_model_{}.pk'.format(self.args.dataset)) + else: + self.load_state_dict(torch.load('./pretrain_model_{}.pk'.format(self.args.dataset))) + + def predict(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = torch.randn_like(mu) * torch.exp(logvar) + mu + z = self.reparameterize(mu,logvar) + pi = self.pi_ + log_sigma2_c = self.log_sigma2_c + mu_c = self.mu_c + gamma_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + + gamma=gamma_c.detach().cpu().numpy() + + return np.argmax(gamma,axis=1),gamma + + + def gaussian_pdfs_log(self,x,mus,log_sigma2s): + G=[] + for c in range(self.args.nClusters): + G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1)) + return torch.cat(G,1) + + + @staticmethod + def gaussian_pdf_log(x,mu,log_sigma2): + return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1)) + + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + +class GCNModelVAECE(nn.Module): + def __init__(self, input_feat_dim, n_nodes, hidden_dim1, hidden_dim2, dropout,args): + super(GCNModelVAECE, self).__init__() + + + self.args = args + self.gc1 = GraphConvolutionSparse(input_feat_dim, hidden_dim1, dropout, act=torch.relu) + self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x) + # self.dc = InnerProductDecoder(dropout, act=lambda x: x) + self.dc = InnerDecoder(dropout, act=lambda x: x) + + #for embedding attributes/features + self.linear_a1= Linear(n_nodes, hidden_dim1, act = torch.tanh,sparse_inputs=True) # the input dim is the number of nodes + self.linear_a2= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + self.linear_a3= Linear(hidden_dim1, hidden_dim2, act = lambda x:x) + + #modularity layer + self.modulairty_layer = Linear(hidden_dim2,args.nClusters,act=torch.relu) + self.cluster_choose= Linear(hidden_dim2,args.nClusters,act=torch.relu) + + + self.pi_=nn.Parameter(torch.FloatTensor(args.nClusters,).fill_(1)/args.nClusters,requires_grad=True) + 
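+        # Mixture means and log-variances follow; in this variant mu_c is Xavier-initialized
+        # just below, while log_sigma2_c is created at 0 (unit variance) with
+        # requires_grad=False, so of the three mixture parameters only pi_ and mu_c are trained.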
self.mu_c=nn.Parameter(torch.FloatTensor(args.nClusters,hidden_dim2).fill_(0.00),requires_grad=True) + self.log_sigma2_c=nn.Parameter(torch.FloatTensor(args.nClusters,hidden_dim2).fill_(0.0),requires_grad=False) + + torch.nn.init.xavier_normal_(self.mu_c) + # torch.nn.init.xavier_normal_(self.log_sigma2_c) + + # calculate mi + + # critic_params = {'dim_x': x.shape[1],'dim_y':y.shape[1],'layers': 2,'embed_dim': 32,'hidden_dim': 64,'activation': 'relu',} + # self.critic_structure = ConcatCritic(hidden_dim2,n_nodes,256,3,'relu',rho=None,) + # self.critic_feature = ConcatCritic(hidden_dim2,input_feat_dim,256,3,'relu',rho=None,) + + def encoder(self, x, adj): + hidden1 = self.gc1(x, adj) + hidden_a1 = self.linear_a1(x.t()) # transpose the input feature matrix + return self.gc2(hidden1, adj), self.gc3(hidden1, adj), self.linear_a2(hidden_a1),self.linear_a3(hidden_a1) + + def decoder(self,mu,mu_a,logvar,logvar_a): + + z_u = self.reparameterize(mu, logvar) + z_a = self.reparameterize(mu_a,logvar_a) + return self.dc((z_u,z_a)) + + def reparameterize(self, mu, logvar): + if self.training: + std = torch.exp(logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + else: + return mu + + def forward(self, x, adj): + + mu, logvar, mu_a, logvar_a = self.encoder(x, adj) + z_u = self.reparameterize(mu, logvar) + z_a = self.reparameterize(mu_a,logvar_a) + return self.dc((z_u,z_a)),mu, logvar, mu_a, logvar_a + + def modularity_loss(self, z,adj): + + adj = adj.to_dense() + H = self.modulairty_layer(z) + assert H.shape[0]==z.shape[0] + + n = torch.tensor(1.0*z.shape[0]) + + H_norm = n.sqrt()*H.sqrt()/(H.sqrt().sum()) + print("H_norm shape",H_norm.shape) + print("H_norm ",H_norm) + m = (adj-torch.eye(adj.shape[0])).sum()/2 + D = (adj-torch.eye(adj.shape[0])).sum(1) # the degree of nodes, adj includes self loop + B = (adj-torch.eye(adj.shape[0]))-torch.matmul(D.view(-1,1),D.view(1,-1))/(2*m) # modularity matrix + mod_loss=torch.trace(torch.matmul(torch.matmul(H_norm.t(),B),H_norm)/(4*m)) + print("mod_loss",mod_loss) + + return mod_loss + + def dist(self,x): + # x = x/torch.norm(x,2,dim=1).view(-1,1) + assert len(x.size()) == 2 + norm = (x ** 2).sum(1).view(-1, 1) + dn = (norm + norm.view(1, -1)) - 2.0 * (x @ x.t()) + return torch.sum(torch.relu(dn).sqrt()) + + def mi_loss(self,z,x,a): + # critic_params = {'dim_x': x.shape[1],'dim_y':y.shape[1],'layers': 2,'embed_dim': 32,'hidden_dim': 64,'activation': 'relu',} + # critic = ConcatCritic(rho=None,**critic_params) + indice = torch.randperm(len(z))[0:50] + # mi_x = estimate_mutual_information('dv',z[indice],x[indice],self.critic_structure) + mi_a = estimate_mutual_information('js',z[indice],a[indice],self.critic_feature) + return mi_a + + def change_cluster_grad_false(self): + for name, param in self.named_parameters(): + if name in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=False + + def change_cluster_grad_true(self): + for name, param in self.named_parameters(): + if name in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=True + + + def change_nn_grad_false(self): + for name, param in self.named_parameters(): + if name not in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=False + + def change_nn_grad_true(self): + for name, param in self.named_parameters(): + if name not in ['pi_','mu_c','log_sigma2_c']: + param.requires_grad=True + + def loss(self,x,adj,labels, n_nodes, n_features, norm, pos_weight,L=1): + + det=1e-10 + labels_sub_u, labels_sub_a = labels + norm_u, norm_a = norm + pos_weight_u, pos_weight_a = pos_weight + + 
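+        # labels, norm and pos_weight arrive as (structure, attribute) pairs. The loop below
+        # draws L reparameterized samples of the latent codes and averages the two weighted
+        # BCE reconstruction terms over them (L=1 by default, i.e. a single-sample estimate).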
L_rec_u=0 + L_rec_a=0 + + mi=0 + + mu, logvar, mu_a, logvar_a = self.encoder(x, adj) + + # mutual information loss + + # z_mu, z_sigma2_log = self.encoder(x) + # mi_a = self.mi_loss(mu,adj.to_dense(),x.to_dense()) + for l in range(L): + + # z=torch.randn_like(z_mu)*torch.exp(z_sigma2_log/2)+z_mu + pred_adj, pred_x = self.decoder(mu,mu_a,logvar,logvar_a) + # L_rec+=F.binary_cross_entropy(x_pro,x) + + cost_u = norm_u * F.binary_cross_entropy_with_logits(pred_adj, labels_sub_u, pos_weight = pos_weight_u) + cost_a = norm_a * F.binary_cross_entropy_with_logits(pred_x, labels_sub_a, pos_weight = pos_weight_a) + # cost_a =torch.Tensor(1).fill_(0) + + L_rec_u += cost_u + L_rec_a += cost_a + + + L_rec_u/=L + L_rec_a/=L + + # z_a = self.reparameterize(mu_a,logvar_a) + # KLD_a = (0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + KLD_a = -(0.5 / n_features) * torch.mean(torch.sum(-1 - 2 * logvar_a + mu_a.pow(2) + logvar_a.exp().pow(2), 1)) + # KLD_a =torch.Tensor(1).fill_(0) + + # Loss=L_rec*x.size(1) + + + # log_sigma2_c=self.log_sigma2_c + # mu_c=self.mu_c + + # z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) + + # mod_loss=self.modularity_loss(z,adj) + # gamma_c=torch.exp(torch.log(self.pi_.unsqueeze(0))+self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + # gamma_c=torch.exp(self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + gamma_c = self.cluster_choose(self.reparameterize(mu,logvar)) + # print('gamma_c:',gamma_c) + + # gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1))#batch_size*Clusters + gamma_c=F.softmax(gamma_c) + # print('gamma_c normalized:',gamma_c) + # print('gamma_c argmax:',torch.argmax(gamma_c,1)) + print('gamma_c counter:',Counter(torch.argmax(gamma_c,1).tolist())) + + + # self.pi_.data = (self.pi_/self.pi_.sum()).data # prior need to be re-normalized? In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + # self.pi_.data = gamma_c.mean(0).data # prior need to be re-normalized? 
In GMM, prior is based on gamma_c:https://brilliant.org/wiki/gaussian-mixture-model/ + + KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1+self.log_sigma2_c.unsqueeze(0)-2*logvar.unsqueeze(1)+torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+(mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + # KLD_u_c=-(0.5/n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(-1-2*logvar.unsqueeze(1)+torch.exp(2*logvar.unsqueeze(1))+(mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2),2),1)) + # temp_kld=-(0.5/n_nodes)*torch.sum((mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2),2) + + # KLD_u_c_test=-(0.5/n_nodes)*F.mse_loss(mu.unsqueeze(1),self.mu_c.unsqueeze(0),reduction='none') + # print('kld_u_c_test:',KLD_u_c_test.sum(2)) + + + # KLD_u_c=-(0.5/n_nodes)*F.mse_loss(mu.unsqueeze(1),self.mu_c.unsqueeze(0)) + + # KLD_u_c=(0.5 / n_nodes)*torch.mean(torch.sum(gamma_c*torch.sum(self.log_sigma2_c.unsqueeze(0)+\ + # torch.exp(2*logvar.unsqueeze(1)-self.log_sigma2_c.unsqueeze(0))+\ + # (mu.unsqueeze(1)-self.mu_c.unsqueeze(0)).pow(2)/torch.exp(self.log_sigma2_c.unsqueeze(0)),2),1)) + + mutual_dist = (1/(self.args.nClusters**2))*self.dist(self.mu_c) + + # gamma_loss=-(1/self.args.nClusters)*torch.mean(torch.sum(gamma_c*torch.log(gamma_c),1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c),1)) - (0.5 / self.args.hid_dim)*torch.mean(torch.sum(1+2*logvar,1)) + gamma_loss = -(1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) + # gamma_loss = (1 / self.args.nClusters) * torch.mean(torch.sum(gamma_c*torch.log(gamma_c/self.pi_.unsqueeze(0)),1)) - (0.5 / self.args.hid_dim)*torch.mean(torch.sum(1+2*logvar,1)) + + + # return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a + return L_rec_u , 0.1*L_rec_a , -30*KLD_u_c ,-KLD_a , -gamma_loss, -0.05*mutual_dist + # return L_rec_u , L_rec_a , -KLD_u_c ,-KLD_a , -gamma_loss,-mi_a + # return L_rec_u + L_rec_a + KLD_u_c + KLD_a + gamma_loss + + + def pre_train(self,x,adj,Y,pre_epoch=22): + ''' + This function is used to initialize cluster paramters: pi_, mu_c, log_sigma2_c. + ------------- + paramters: + x: is the feature matrix of graph G. + adj: is the adjacent matrix of graph G. + Y: is the class label for each node in graph G. 
+ ''' + + if not os.path.exists('./pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch)): + + Loss=nn.MSELoss() + opti=Adam(self.parameters()) #all paramters in model + + print('Pretraining......') + # epoch_bar=tqdm(range(pre_epoch)) + # for _ in epoch_bar: + for _ in range(pre_epoch): + + self.train() + L=0 + mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + pred_adj, pred_x = self.decoder(mu,mu_a,logvar,logvar_a) + + loss= Loss(pred_x,x) + Loss(pred_adj,adj) + + L+=loss.detach().cpu().numpy() + + opti.zero_grad() + loss.backward() + opti.step() + + # epoch_bar.write('L2={:.4f}'.format(L)) + print('L2={:.4f}'.format(L)) + + # self.gc2.load_state_dict(self.gc3.state_dict()) + # self.linear_a2.load_state_dict(self.linear_a3.state_dict()) + + + # with torch.no_grad(): + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # assert F.mse_loss(mu, logvar) == 0 + # assert F.mse_loss(mu_a, logvar_a) == 0 + # Z = mu.data.numpy() + + mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + Z = self.reparameterize(mu,logvar) + + gmm = GaussianMixture(n_components=self.args.nClusters, covariance_type='diag') + + pre = gmm.fit_predict(Z.cpu().detach().numpy()) + print('Acc={:.4f}%'.format(cluster_acc(pre, Y)[0] * 100)) + + self.pi_.data = torch.from_numpy(gmm.weights_).float() + self.mu_c.data = torch.from_numpy(gmm.means_).float() + self.log_sigma2_c.data = torch.log(torch.from_numpy(gmm.covariances_).float()) + + torch.save(self.state_dict(), './pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch)) + else: + self.load_state_dict(torch.load('./pretrain_model_{}_{}.pk'.format(self.args.dataset,pre_epoch))) + + def predict_nn(self,mu,logvar): + z = self.reparameterize(mu,logvar) + gamma_c = self.cluster_choose(self.reparameterize(mu,logvar)) + + print('gamma_c,normalized:',gamma_c) + print('gamma_c argmax:',torch.argmax(gamma_c,1)) + print('gamma_c argmax counter:',Counter(torch.argmax(gamma_c,1).tolist())) + + gamma=gamma_c.detach().cpu().numpy() + + + return np.argmax(gamma,axis=1),gamma, z + + + + def predict(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = torch.randn_like(mu) * torch.exp(z_sigma2_log / 2) + z_mu + det=1e-10 + z = self.reparameterize(mu,logvar) + pi = self.pi_ + # log_sigma2_c = self.log_sigma2_c + # mu_c = self.mu_c + # gamma_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + gamma_c = torch.exp(self.gaussian_pdfs_log(z,self.mu_c,self.log_sigma2_c))+det + print('gamma_c:',gamma_c) + gamma_c=gamma_c/(gamma_c.sum(1).view(-1,1))#batch_size*Clusters + gamma_c=F.softmax(gamma_c) + print('gamma_c,normalized:',gamma_c) + print('gamma_c argmax:',torch.argmax(gamma_c,1)) + print('gamma_c argmax counter:',Counter(torch.argmax(gamma_c,1).tolist())) + + gamma=gamma_c.detach().cpu().numpy() + + def predict_dist(self,mu, logvar): + # z_mu, z_sigma2_log, z_ma,z_a_sigma2_log = self.encoder(x,adj) + # mu, logvar, mu_a, logvar_a = self.encoder(x,adj) + # z = torch.randn_like(mu) * torch.exp(z_sigma2_log / 2) + z_mu + z = self.reparameterize(mu,logvar) + pi = self.pi_ + log_sigma2_c = self.log_sigma2_c + mu_c = self.mu_c + # gamma_c = torch.exp(self.gaussian_pdfs_log(z,mu_c,log_sigma2_c)) + + # gamma=gamma_c.detach().cpu().numpy() + + gamma=[] + for e in range(z.shape[0]): + temp_dist=[] + for m in range(mu_c.shape[0]): + temp_dist.append(F.mse_loss(z[e],mu_c[m]).data) + gamma.append(temp_dist) + + return np.argmin(gamma,axis=1),np.array(gamma) + + def 
plot_tsne(self,dataset,epoch,z,true_label,pred_label): + + tsne = TSNE(n_components=2, init='pca',perplexity=50.0) + data = torch.cat([z,self.mu_c.to('cpu')],dim=0).detach().numpy() + zs_tsne = tsne.fit_transform(data) + + cluster_labels=set(true_label) + print(cluster_labels) + index_group= [np.array(true_label)==y for y in cluster_labels] + colors = cm.tab20(range(len(index_group))) + + fig, ax = plt.subplots() + for index,c in zip(index_group,colors): + ax.scatter(zs_tsne[np.ix_(index), 0], zs_tsne[np.ix_(index), 1],color=c,s=2) + + ax.scatter(zs_tsne[z.shape[0]:, 0], zs_tsne[z.shape[0]:, 1],marker='^',color='b',s=40) + plt.title('true label') + # ax.legend() + plt.savefig("./visualization/{}_{}_tsne_{}.pdf".format(dataset,epoch,'true_label')) + + cluster_labels=set(pred_label) + print(cluster_labels) + index_group= [np.array(pred_label)==y for y in cluster_labels] + colors = cm.tab10(range(len(index_group))) + + fig, ax = plt.subplots() + for index,c in zip(index_group,colors): + ax.scatter(zs_tsne[np.ix_(index), 0], zs_tsne[np.ix_(index), 1],color=c,s=2) + + for index,c in enumerate(colors): + ax.scatter(zs_tsne[z.shape[0]+index:z.shape[0]+index+1, 0], zs_tsne[z.shape[0]+index:z.shape[0]+index+1, 1],marker='^',color=c,s=40) + + plt.title('pred label') + # ax.legend() + plt.savefig("./visualization/{}_{}_tsne_{}.pdf".format(dataset,epoch,'pred_label')) + + def gaussian_pdfs_log(self,x,mus,log_sigma2s): + G=[] + for c in range(self.args.nClusters): + G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1)) + return torch.cat(G,1) + + + @staticmethod + def gaussian_pdf_log(x,mu,log_sigma2): + return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1)) # np.pi*2, not square + + def check_parameters(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print(name, param.data,param.data.shape) + def check_gradient(self): + for name, param in self.named_parameters(): + if param.requires_grad: + print('grad: ',name) + print(param.grad,param.grad.shape) diff --git a/train.py b/train.py index 689bf94..7873b33 100644 --- a/train.py +++ b/train.py @@ -52,10 +52,8 @@ def training(args): print("node size:{}, feature size:{}".format(n_nodes,n_features)) - adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(sp.csc_matrix(adj_init)) - print('adj_train sum\n',adj_train.sum()/2) - fea_train, train_feas, val_feas, val_feas_false, test_feas, test_feas_false = mask_test_feas(features) - print('fea_train shape',fea_train.shape) + # adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj_init) + # fea_train, train_feas, val_feas, val_feas_false, test_feas, test_feas_false = mask_test_feas(features) features_orig = features features_label = torch.FloatTensor(features.toarray()) @@ -77,28 +75,20 @@ def training(args): embedding_attr_var_result_file = "result/AGAE_{}_a_sig.emb".format(args.dataset) # Some preprocessing, get the support matrix, D^{-1/2}\hat{A}D^{-1/2} - adj_norm = preprocess_graph(adj_train) + adj_norm = preprocess_graph(adj_init) + print("graph edge number after normalize adjacent matrix:{}".format(adj_init.sum()/2)) - # pos_weight_u = torch.tensor(float(adj_init.shape[0] * adj_init.shape[0] - adj_init.sum()) / adj_init.sum()) #?? - # norm_u = adj_init.shape[0] * adj_init.shape[0] / float((adj_init.shape[0] * adj_init.shape[0] - adj_init.sum()) * 2) #?? 
- # pos_weight_a = torch.tensor(float(features[2][0] * features[2][1] - len(features[1])) / len(features[1])) - # norm_a = features[2][0] * features[2][1] / float((features[2][0] * features[2][1] - len(features[1])) * 2) + pos_weight_u = torch.tensor(float(adj_init.shape[0] * adj_init.shape[0] - adj_init.sum()) / adj_init.sum()) #?? + norm_u = adj_init.shape[0] * adj_init.shape[0] / float((adj_init.shape[0] * adj_init.shape[0] - adj_init.sum()) * 2) #?? + pos_weight_a = torch.tensor(float(features[2][0] * features[2][1] - len(features[1])) / len(features[1])) + norm_a = features[2][0] * features[2][1] / float((features[2][0] * features[2][1] - len(features[1])) * 2) - pos_weight_u = torch.tensor(float(adj_train.shape[0] * adj_train.shape[0] - adj_train.sum()) / adj_train.sum()) #?? - norm_u = adj_train.shape[0] * adj_train.shape[0] / float((adj_train.shape[0] * adj_train.shape[0] - adj_train.sum()) * 2) #?? - pos_weight_a = torch.tensor(float(fea_train.shape[0] * fea_train.shape[1] - (fea_train.sum())) / (fea_train.sum())) - norm_a = fea_train.shape[0] * fea_train.shape[0] / float(fea_train.shape[0] * fea_train.shape[1] - fea_train.sum()) * 2 features_training = sparse_mx_to_torch_sparse_tensor(features_orig) - print('pos_weight_u,norm_u,pos_weight_a,norm_a',pos_weight_u,norm_u,pos_weight_a,norm_a) - - adj_label = torch.FloatTensor(adj_train.toarray()+sp.eye(adj_init.shape[0])) # add the identity matrix to the adj as label - - fea_train = sparse_mx_to_torch_sparse_tensor(fea_train) - adj_train = sparse_mx_to_torch_sparse_tensor(adj_train) # clustering pretraining for GMM paramter initialization # writer=SummaryWriter('./logs') + adj_label = torch.FloatTensor(adj_init.toarray()+sp.eye(adj_init.shape[0])) # add the identity matrix to the adj as label mean_h=[] mean_c=[] @@ -112,11 +102,6 @@ def training(args): if args.cuda: features_training = features_training.to_dense().cuda() - fea_train = fea_train.to_dense().cuda() - print('fea_train\n',fea_train) - adj_train = adj_train.to_dense().cuda() - print('adj_train\n',adj_train) - print('adj_train sum\n',adj_train.sum()) adj_norm = adj_norm.to_dense().cuda() pos_weight_u = pos_weight_u.cuda() pos_weight_a = pos_weight_a.cuda() @@ -194,8 +179,8 @@ def training(args): elif args.model =='gcn_vaece': #gcn with vae for co-embedding of feature and graph - (recovered_u, recovered_a), mu_u, logvar_u, mu_a, logvar_a = model(fea_train,adj_norm) - loss_list = model.loss(fea_train,adj_norm,labels = (adj_train, fea_train), n_nodes = n_nodes, n_features = n_features,norm = (norm_u, norm_a), pos_weight = (pos_weight_u, pos_weight_a)) + (recovered_u, recovered_a), mu_u, logvar_u, mu_a, logvar_a = model(features_training, adj_norm) + loss_list = model.loss(features_training,adj_norm,labels = (adj_label, features_label), n_nodes = n_nodes, n_features = n_features,norm = (norm_u, norm_a), pos_weight = (pos_weight_u, pos_weight_a)) loss =sum(loss_list) if epoch%10 <8: diff --git a/train_bk_2021_1_7.py b/train_bk_2021_1_7.py new file mode 100644 index 0000000..5a24c1f --- /dev/null +++ b/train_bk_2021_1_7.py @@ -0,0 +1,359 @@ +from __future__ import division +from __future__ import print_function + +import argparse +import time +import numpy as np +import scipy.sparse as sp +import torch +from torch import optim +from torch.autograd import Variable +from torch.optim.lr_scheduler import StepLR +from model import GCNModelVAE,GCNModelVAECD,GCNModelAE,GCNModelVAECE +from utils import preprocess_graph, get_roc_score, 
sparse_to_tuple,sparse_mx_to_torch_sparse_tensor,cluster_acc,clustering_evaluation, find_motif +from preprocessing import mask_test_feas,mask_test_edges, load_AN, check_symmetric,load_data +from tqdm import tqdm +from tensorboardX import SummaryWriter +from evaluation import clustering_latent_space +from collections import Counter +import itertools +import random + +import warnings +warnings.simplefilter("ignore") + +def training(args): + + print("Using {} dataset".format(args.dataset)) + # adj_init, features, Y= load_AN(args.dataset) + adj_init, features, labels, idx_train, idx_val, idx_test = load_data(args.dataset) + Y = np.argmax(labels,1) # labels is in one-hot format + + # Store original adjacency matrix (without diagonal entries) for later + adj_init = adj_init- sp.dia_matrix((adj_init.diagonal()[np.newaxis, :], [0]), shape=adj_init.shape) + adj_init.eliminate_zeros() + + assert adj_init.diagonal().sum()==0,"adj diagonal sum:{}, should be 0".format(adj_init.diagonal().sum()) + n_nodes, n_features= features.shape + # assert check_symmetric(adj_init).sum()==n_nodes*n_nodes,"adj should be symmetric" + print("imported graph edge number (without selfloop):{}".format((adj_init-adj_init.diagonal()).sum()/2)) + + # find motif 3 nodes + + # motif_matrix=find_motif(adj_init,args.dataset) + # print("find motif") + + + args.nClusters=len(set(Y)) + # args.nClusters=1 + print("cluster number:{}".format(args.nClusters)) + assert(adj_init.shape[0]==n_nodes) + + print("node size:{}, feature size:{}".format(n_nodes,n_features)) + + + # adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj_init) + # fea_train, train_feas, val_feas, val_feas_false, test_feas, test_feas_false = mask_test_feas(features) + + features_orig = features + features_label = torch.FloatTensor(features.toarray()) + features = sp.lil_matrix(features) + + features = sparse_to_tuple(features.tocoo()) + + features_nonzero = features[1].shape[0] + + print("graph edge number after mask:{}".format(adj_init.sum()/2)) + + + + # save result to files + link_predic_result_file = "result/AGAE_{}.res".format(args.dataset) + embedding_node_mean_result_file = "result/AGAE_{}_n_mu.emb".format(args.dataset) + embedding_attr_mean_result_file = "result/AGAE_{}_a_mu.emb".format(args.dataset) + embedding_node_var_result_file = "result/AGAE_{}_n_sig.emb".format(args.dataset) + embedding_attr_var_result_file = "result/AGAE_{}_a_sig.emb".format(args.dataset) + + # Some preprocessing, get the support matrix, D^{-1/2}\hat{A}D^{-1/2} + adj_norm = preprocess_graph(adj_init) + print("graph edge number after normalize adjacent matrix:{}".format(adj_init.sum()/2)) + + pos_weight_u = torch.tensor(float(adj_init.shape[0] * adj_init.shape[0] - adj_init.sum()) / adj_init.sum()) #?? + norm_u = adj_init.shape[0] * adj_init.shape[0] / float((adj_init.shape[0] * adj_init.shape[0] - adj_init.sum()) * 2) #?? 
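+    # pos_weight_u is (#zero entries)/(#nonzero entries) of the adjacency matrix; it up-weights
+    # the rare positive (edge) class inside binary_cross_entropy_with_logits, and norm_u rescales
+    # the mean loss accordingly (the usual GAE-style weighting for sparse graphs). The analogous
+    # weights for the attribute matrix are computed next.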
+ pos_weight_a = torch.tensor(float(features[2][0] * features[2][1] - len(features[1])) / len(features[1])) + norm_a = features[2][0] * features[2][1] / float((features[2][0] * features[2][1] - len(features[1])) * 2) + + features_training = sparse_mx_to_torch_sparse_tensor(features_orig) + + # clustering pretraining for GMM paramter initialization + # writer=SummaryWriter('./logs') + + adj_label = torch.FloatTensor(adj_init.toarray()+sp.eye(adj_init.shape[0])) # add the identity matrix to the adj as label + + mean_h=[] + mean_c=[] + mean_v=[] + mean_ari=[] + mean_ami=[] + mean_nmi=[] + mean_purity=[] + mean_accuracy=[] + + + if args.cuda: + features_training = features_training.to_dense().cuda() + adj_norm = adj_norm.to_dense().cuda() + pos_weight_u = pos_weight_u.cuda() + pos_weight_a = pos_weight_a.cuda() + adj_label = adj_label.cuda() + features_label = features_label.cuda() + # idx_train = idx_train.cuda() + + # idx_val = idx_val.cuda() + # idx_test = idx_test.cuda() + + features_training, adj_norm = Variable(features_training), Variable(adj_norm) + pos_weight_u = Variable(pos_weight_u) + pos_weight_a = Variable(pos_weight_a) + + for r in range(args.num_run): + + model = None + if args.model == 'gcn_ae': + model = GCNModelAE(n_features,n_nodes, args.hidden1, args.hidden2, args.dropout,args) + elif args.model == 'gcn_vae': + model = GCNModelVAE(n_features,n_nodes, args.hidden1, args.hidden2, args.dropout,args) + elif args.model == 'gcn_vaecd': + model = GCNModelVAECD(n_features,n_nodes, args.hidden1, args.hidden2, args.dropout,args) + elif args.model =='gcn_vaece': #gcn with vae for co-embedding of feature and graph + model = GCNModelVAECE(n_features,n_nodes, args.hidden1, args.hidden2, args.dropout,args) + + # using GMM to pretrain the clustering parameters + + if args.cuda: + model.cuda() + + print([i for i in model.named_parameters()]) + + + if args.model == 'gcn_vaecd': + params1=[model.gc1.parameters(),model.gc2.parameters(),model.gc3.parameters(),model.dc.parameters()] + optimizer1 = optim.Adam(itertools.chain(*params1), lr=args.lr) + elif args.model == 'gcn_vaece': + params1=[model.gc1.parameters(),model.gc2.parameters(),model.gc3.parameters(),model.dc.parameters(),model.linear_a1.parameters(),model.linear_a2.parameters(),model.linear_a3.parameters()] + optimizer1 = optim.Adam(itertools.chain(*params1), lr=args.lr) + + # model.pre_train(features_training,adj_norm,Y,pre_epoch=50) + + optimizer2 = optim.Adam(model.parameters(), lr=args.lr) + + # params2=[model.pi_,model.mu_c,model.log_sigma2_c] + # optimizer2 = optim.Adam(itertools.chain(*params2), lr=args.lr) + + hidden_emb_u = None + hidden_emb_a = None + + cost_val = [] + acc_val = [] + val_roc_score = [] + lr_s=StepLR(optimizer1,step_size=30,gamma=0.95) # it seems that fix leanring rate is better + + loss_list=None + for epoch in range(args.epochs): + t = time.time() + model.train() + + if args.model =='gcn_vaecd': + recovered_u, mu_u, logvar_u = model(features_training, adj_norm) + loss_list = model.loss(features_training,adj_norm,labels = adj_label, n_nodes = n_nodes, n_features = n_features,norm = norm_u, pos_weight = pos_weight_u) + loss =sum(loss_list) + + elif args.model == 'gcn_ae': + recovered_u, mu_u,logvar_u = model(features_training, adj_norm) + loss_list = model.loss(recovered_u,labels = adj_label, n_nodes = n_nodes, n_features = n_features,norm = norm_u, pos_weight = pos_weight_u) + loss =sum(loss_list) + elif args.model == 'gcn_vae': + recovered_u, mu_u, logvar_u = model(features_training, adj_norm) + loss_list = 
model.loss(features_training,adj_norm,labels = adj_label, n_nodes = n_nodes, n_features = n_features,norm = norm_u, pos_weight = pos_weight_u) + loss =sum(loss_list) + elif args.model =='gcn_vaece': #gcn with vae for co-embedding of feature and graph + + + (recovered_u, recovered_a), mu_u, logvar_u, mu_a, logvar_a = model(features_training, adj_norm) + loss_list = model.loss(features_training,adj_norm,labels = (adj_label, features_label), n_nodes = n_nodes, n_features = n_features,norm = (norm_u, norm_a), pos_weight = (pos_weight_u, pos_weight_a)) + loss =sum(loss_list) + + if epoch%10 <8: + model.change_nn_grad_true() + model.change_cluster_grad_false() + optimizer2.zero_grad() + loss.backward() + optimizer2.step() + else: + model.change_nn_grad_false() + model.change_cluster_grad_true() + optimizer2.zero_grad() + loss.backward() + optimizer2.step() + + + + lr_s.step() + + # model.check_gradient() + # model.check_parameters() + + # if (epoch+1)%50==0: + # pre,gamma,z = model.predict(mu_u,logvar_u) + # model.plot_tsne(args.dataset,epoch,z,pre,'predict label') + # model.plot_tsne(args.dataset,epoch,z,Y,'true label') + + + + correct_prediction_u = ((torch.sigmoid(recovered_u.to('cpu'))>=0.5)==adj_label.type(torch.LongTensor)) + # correct_prediction_a = ((torch.sigmoid(recovered_a)>=0.5).type(torch.LongTensor)==features_label.type(torch.LongTensor)).type(torch.FloatTensor) + + accuracy = torch.mean(correct_prediction_u*1.0) + + # hidden_emb_u = mu_u.data.numpy() + # hidden_emb_a = mu_a.data.numpy() + # roc_curr, ap_curr = get_roc_score(np.dot(hidden_emb_u,hidden_emb_u.T), adj, val_edges, val_edges_false) + # roc_curr_a, ap_curr_a = get_roc_score(np.dot(hidden_emb_u,hidden_emb_a.T), features_orig, val_feas, val_feas_false) + + # val_roc_score.append(roc_curr) + + #clustering############# + pre=[] + tru=[] + gamma = None + + + tru=Y + # model.eval() + + # if args.model == 'vgaecd': + # pre=model.predict(mu_u,logvar_u) + + # print("True label:{}".format(tru)) + # print(Counter(tru)) + # print("Predicted label:{}".format(pre)) + # print(Counter(pre)) + + # # mc_ + # print("cluster means") + # print(model.mu_c.data) + + # print("cluster prior") + # print(model.pi_.data) + # else: + # pre=clustering_latent_space(mu_u.detach().numpy(),tru) + + # writer.add_scalar('loss',loss.item(),epoch) + # writer.add_scalar('acc',cluster_acc(pre,tru)[0]*100,epoch) + # writer.add_scalar('lr',lr_s.get_last_lr()[0],epoch) + + # print('Loss={:.4f},Clustering_ACC={:.4f}%,LR={:.4f}'.format(loss.item(),cluster_acc(pre,tru)[0]*100,lr_s.get_last_lr()[0])) + # H, C, V, ari, ami, nmi, purity = clustering_evaluation(tru,pre) + # print('H:{} C:{} V:{} ari:{} ami:{} nmi:{} purity:{}'.format(H, C, V, ari, ami, nmi, purity)) + + ####################### + + + print("Epoch:", '%04d' % (epoch + 1), + "LR={:.4f}".format(lr_s.get_last_lr()[0]), + "train_loss_total=", "{:.5f}".format(loss.item()), + "train_loss_parts=", "{}".format([round(l.item(),4) for l in loss_list]), + # "log_lik=", "{:.5f}".format(cost.item()), + # "KL_u=", "{:.5f}".format(KLD_u.item()), + # "KL_a=", "{:.5f}".format(KLD_a.item()), + # "yita_loss=", "{:.5f}".format(yita_loss.item()), + "link_pred_train_acc=", "{:.5f}".format(accuracy.item()), + # "val_edge_roc=", "{:.5f}".format(val_roc_score[-1]), + # "val_edge_ap=", "{:.5f}".format(ap_curr), + # "val_attr_roc=", "{:.5f}".format(roc_curr_a), + # "val_attr_ap=", "{:.5f}".format(ap_curr_a), + "time=", "{:.5f}".format(time.time() - t)) + + # model.check_parameters() + # z = model.reparameterize(mu_u,logvar_u) + # 
model.plot_tsne(args.dataset,epoch,z,tru,'true label') + print("Optimization Finished!") + + # if args.model == 'gcn_vaece': + # (recovered_u, recovered_a), mu_u, logvar_u, mu_a, logvar_a = model(features_training, adj_norm) + # else: + # recovered_u, mu_u, logvar_u = model(features_training, adj_norm) + + + pre,gamma,z = model.predict(mu_u,logvar_u) + + H, C, V, ari, ami, nmi, purity = clustering_evaluation(tru,pre) + acc = cluster_acc(pre,tru)[0]*100 + mean_h.append(round(H,4)) + mean_c.append(round(C,4)) + mean_v.append(round(V,4)) + mean_ari.append(round(ari,4)) + mean_ami.append(round(ami,4)) + mean_nmi.append(round(nmi,4)) + mean_purity.append(round(purity,4)) + mean_accuracy.append(round(acc,4)) + + if args.model in ['gcn_vaecd','gcn_vaece']: + pre,gamma,z = model.predict(mu_u,logvar_u) + model.plot_tsne(args.dataset,epoch,z.to('cpu'),tru,pre) + else: + pre=clustering_latent_space(mu_u.detach().numpy(),tru) + + # np.save(embedding_node_mean_result_file, mu_u.data.numpy()) + # np.save(embedding_attr_mean_result_file, mu_a.data.numpy()) + # np.save(embedding_node_var_result_file, logvar_u.data.numpy()) + # np.save(embedding_attr_var_result_file, logvar_a.data.numpy()) + + # roc_score, ap_score = get_roc_score(np.dot(hidden_emb_u,hidden_emb_u.T), adj, test_edges, test_edges_false) + # roc_score_a, ap_score_a = get_roc_score(np.dot(hidden_emb_u,hidden_emb_a.T), features_orig, test_feas, test_feas_false) + + # print('Test edge ROC score: ' + str(roc_score)) + # print('Test edge AP score: ' + str(ap_score)) + # print('Test attr ROC score: ' + str(roc_score_a)) + # print('Test attr AP score: ' + str(ap_score_a)) + + + ###### Report Final Results ###### + print('Homogeneity:{}\t mean:{}\t std:{}\n'.format(mean_h,round(np.mean(mean_h),4),round(np.std(mean_h),4))) + print('Completeness:{}\t mean:{}\t std:{}\n'.format(mean_c,round(np.mean(mean_c),4),round(np.std(mean_c),4))) + print('V_measure_score:{}\t mean:{}\t std:{}\n'.format(mean_v,round(np.mean(mean_v),4),round(np.std(mean_v),4))) + print('adjusted Rand Score:{}\t mean:{}\t std:{}\n'.format(mean_ari,round(np.mean(mean_ari),4),round(np.std(mean_ari),4))) + print('adjusted Mutual Information:{}\t mean:{}\t std:{}\n'.format(mean_ami,round(np.mean(mean_ami),4),round(np.std(mean_ami),4))) + print('Normalized Mutual Information:{}\t mean:{}\t std:{}\n'.format(mean_nmi,round(np.mean(mean_nmi),4),round(np.std(mean_nmi),4))) + print('Purity:{}\t mean:{}\t std:{}\n'.format(mean_purity,round(np.mean(mean_purity),4),round(np.std(mean_purity),4))) + print('Accuracy:{}\t mean:{}\t std:{}\n'.format(mean_accuracy,round(np.mean(mean_accuracy),4),round(np.std(mean_accuracy),4))) + print("True label distribution:{}".format(tru)) + print(Counter(tru)) + print("Predicted label distribution:{}".format(pre)) + print(Counter(pre)) + +def parse_args(): + parser = argparse.ArgumentParser(description="Node clustering") + parser.add_argument('--model', type=str, default='gcn_ae', help="models used for clustering: gcn_ae,gcn_vae,gcn_vaecd,gcn_vaece") + parser.add_argument('--seed', type=int, default=42, help='Random seed.') + parser.add_argument('--epochs', type=int, default=300, help='Number of epochs to train.') + parser.add_argument('--hidden1', type=int, default=32, help='Number of units in hidden layer 1.') + parser.add_argument('--hidden2', type=int, default=16, help='Number of units in hidden layer 2.') + parser.add_argument('--lr', type=float, default=0.002, help='Initial aearning rate.') + parser.add_argument('--dropout', type=float, default=0.2, 
help='Dropout rate (1 - keep probability).') + parser.add_argument('--dataset', type=str, default='cora', help='Dataset name.') + parser.add_argument('--nClusters',type=int,default=7) + parser.add_argument('--num_run',type=int,default=1,help='Number of training runs') + parser.add_argument('--cuda', action='store_true', default=False, help='Enable CUDA training.') + args, unknown = parser.parse_known_args() + + return args + +if __name__ == '__main__': + args = parse_args() + if args.cuda: + torch.cuda.set_device(1) + torch.cuda.manual_seed(args.seed) + random.seed(args.seed) + np.random.seed(args.seed) + training(args)
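For reference, a minimal usage sketch for the training entry point added above. It only uses flags that parse_args() already defines; the import assumes the sketch sits next to train_bk_2021_1_7.py, and the chosen values (gcn_vaece, cora, 300 epochs, 1 run) are simply the options and defaults visible in this diff, not new requirements.

import sys
from train_bk_2021_1_7 import parse_args, training

# Equivalent to: python train_bk_2021_1_7.py --model gcn_vaece --dataset cora --epochs 300 --num_run 1
sys.argv = ['train_bk_2021_1_7.py',
            '--model', 'gcn_vaece',   # one of: gcn_ae, gcn_vae, gcn_vaecd, gcn_vaece
            '--dataset', 'cora',
            '--epochs', '300',
            '--num_run', '1']

args = parse_args()   # parse_known_args() silently ignores flags it does not recognise
training(args)        # note: training() resets args.nClusters from the label set of the dataset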