From 730be5a92987dfc9013422500e9d9ca3aefb6715 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 19 Jul 2022 12:03:49 -0400 Subject: [PATCH 01/93] Update initialization.gsql --- .../db_scripts/queries/initialization.gsql | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql b/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql index 78d2b54..0010d88 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql @@ -1,8 +1,20 @@ CREATE QUERY initialization(/* Parameters here */) FOR GRAPH CitationGraph SYNTAX V2 { - /*This query normalizes the weights on CITE edges according to the outdegrees(CITE) of the source - and target vertices, normalizes the weights on HAS edges according the outdegrees(HAS) of the PAPER vertices, - populates the words attribute with (word indx -> weight), - and splits PAPER vertices into testing, validation and training sets, */ +/* + + Initializes weights on edges and attributes before splitting vertices into + training, validation, and testing sets. + + No inputs + + Starting from all PAPER vertices: + (1) Normalize the weights on CITE edges according to the outdegrees(CITE) + of the source and target vertices + (2 )Normalize the weights on HAS edges according the outdegrees(HAS) of + the PAPER vertices + (3) Populate the words attribute with (word indx -> weight) + (4) Split PAPER vertices into testing, validation and training sets + +*/ MapAccum @word_Map; Papers = {PAPER.*}; @@ -21,4 +33,4 @@ CREATE QUERY initialization(/* Parameters here */) FOR GRAPH CitationGraph SYNTA END, s.words = s.@word_Map; PRINT "initialization finished!"; -} \ No newline at end of file +} From fcb915b92f262d56adaac6c7d2cf1d0ec74efb91 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 19 Jul 2022 12:04:41 -0400 Subject: [PATCH 02/93] Update initialization.gsql --- .../db_scripts/queries/initialization.gsql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql b/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql index 0010d88..656d553 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql @@ -15,7 +15,8 @@ CREATE QUERY initialization(/* Parameters here */) FOR GRAPH CitationGraph SYNTA (4) Split PAPER vertices into testing, validation and training sets */ - MapAccum @word_Map; + + MapAccum @word_Map; Papers = {PAPER.*}; From e6e2667342602955be2b926f54521858a4e020f8 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 19 Jul 2022 12:06:13 -0400 Subject: [PATCH 03/93] Update initialization.gsql --- .../db_scripts/queries/initialization.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql b/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql index 656d553..37a8155 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql @@ -9,7 +9,7 @@ CREATE QUERY initialization(/* Parameters here */) FOR GRAPH CitationGraph SYNTA Starting 
from all PAPER vertices: (1) Normalize the weights on CITE edges according to the outdegrees(CITE) of the source and target vertices - (2 )Normalize the weights on HAS edges according the outdegrees(HAS) of + (2) Normalize the weights on HAS edges according the outdegrees(HAS) of the PAPER vertices (3) Populate the words attribute with (word indx -> weight) (4) Split PAPER vertices into testing, validation and training sets From de16aaefc3466d14e81f25745a1c18d7d5573a62 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 19 Jul 2022 14:41:06 -0400 Subject: [PATCH 04/93] Update weight_initialization.gsql --- .../queries/weight_initialization.gsql | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/weight_initialization.gsql b/Graph-Convolutional-Networks/db_scripts/queries/weight_initialization.gsql index 647653b..fb0e755 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/weight_initialization.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/weight_initialization.gsql @@ -1,6 +1,16 @@ CREATE QUERY weight_initialization() FOR GRAPH CitationGraph SYNTAX V2 { - /*This query initializes the weights for the neural network. - The neural network has 1433 neurons in the input layer, 16 neurons in the hidden layer and 7 neurons in the output layer */ +/* + Initializes the weights for the neural network which has + 1433 neurons in the input layer, 16 neurons in the hidden + layer and 7 neurons in the output layer + + No inputs + + Starting with all WORD vertice: + (1) Calculate weight of all edges between words and layer 0 + (2) Calculate weight of all edges between layer 0 and 1 +*/ + INT input_dim = 1433; INT hidden_dim = 16; INT output_dim = 7; @@ -17,4 +27,4 @@ CREATE QUERY weight_initialization() FOR GRAPH CitationGraph SYNTAX V2 { e.weight = 2*sqrt(6.0/(output_dim+hidden_dim))*(rand_uniform()-0.5); PRINT "weight_initialization finished"; -} \ No newline at end of file +} From bec6f46264848a1385ef1151d84157cb7111cfd5 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 19 Jul 2022 14:52:16 -0400 Subject: [PATCH 05/93] Update training.gsql --- .../db_scripts/queries/training.gsql | 254 ++++++++++-------- 1 file changed, 146 insertions(+), 108 deletions(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/training.gsql b/Graph-Convolutional-Networks/db_scripts/queries/training.gsql index 0b5a363..2232eda 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/training.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/training.gsql @@ -1,17 +1,53 @@ CREATE QUERY training( DOUBLE alpha0 = 0.4, // initial learning rate + BOOL Adam = True, // enable Adam optimizer. If False, constant learning rate will be used + DOUBLE beta1 = 0.9, // hyperparameter for Adam optimizer + DOUBLE beta2 = 0.999, // hyperparameter for Adam optimizer + DOUBLE keepProb = 1.0, // keep probability for the dropout regularization + DOUBLE lambda = 0.00005, // L2 regularization factor + INT MaxIter = 10) // number of epochs + FOR GRAPH CitationGraph SYNTAX V2 { - /*This query trains the graph convolutional neural network on the training dataset - and evaluates the loss on the validation data and the prediction accuracy on the testing data. 
*/ - ArrayAccum> @@W_0[1433][16]; // 1433 by 16 + +/* + Trains the graph convolutional neural network on the training dataset + and evaluates the loss on the validation data and the prediction accuracy on the + testing data. + + No sample inputs + + Default inputs: + alpha0 = 0.4 + Adam = True + beta1 = 0.9 + beta2 = 0.999 + keepProb = 1.0 + lambda = 0.00005 + MaxIter = 10 + + Start with all WORD vertices: + (1) Load weights into layer 0 and layer 1 + + Using PAPER vertices: + (1) Forward Propagation + (a) Convolve + (b) Hidden layers + (3) Backwards Propgation + (a) Train with loss (Use Adam) + (b) Update weights + + */ + + + ArrayAccum> @@W_0[1433][16]; // 1433 by 16 ArrayAccum> @@W_1[16][7]; // 16 by 7 - ArrayAccum> @@dW_0[1433][16]; // 1433 by 16 + ArrayAccum> @@dW_0[1433][16]; // 1433 by 16 ArrayAccum> @@dW_1[16][7]; // 16 by 7 ArrayAccum> @@VdW_0[1433][16]; // 1433 by 16 ArrayAccum> @@VdW_1[16][7]; // 16 by 7 @@ -21,7 +57,7 @@ FOR GRAPH CitationGraph SYNTAX V2 { SumAccum @@Validation_Loss; SumAccum @@accurate_cnt; - MapAccum @words; + MapAccum @words; ArrayAccum> @zeta_0[16]; ArrayAccum> @zeta_1[7]; ArrayAccum> @dzeta_0[16]; @@ -36,7 +72,8 @@ FOR GRAPH CitationGraph SYNTAX V2 { INT train_cnt = 140; INT val_cnt = 500; INT test_cnt = 1000; - // load weights into @@W_0 and @@W_1 + + // load weights into @@W_0 and @@W_1 WORDs = {WORD.*}; LAYER_0s = SELECT t FROM WORDs:s -(:e)- LAYER_0:t ACCUM @@ -49,110 +86,111 @@ FOR GRAPH CitationGraph SYNTAX V2 { // forward propagation Start = {PAPER.*}; - alpha = alpha0; + alpha = alpha0; -WHILE iter < MaxIter DO - // input -> hidden layer0 - @@Training_Loss = 0; - @@Validation_Loss = 0; - @@accurate_cnt = 0; - @@dW_0.reallocate(1433,16); - @@dW_1.reallocate(16,7); - Start = SELECT s FROM Start:s - POST-ACCUM - s.@zeta_0.reallocate(16), - s.@z_0.reallocate(16), - s.@zeta_1.reallocate(7), - s.@z_1.reallocate(7), - s.@dzeta_0.reallocate(16), - s.@dz_0.reallocate(16), - s.@dzeta_1.reallocate(7), - s.@dz_1.reallocate(7), - s.@words = dropout_SparseVector(s.words, keepProb), - s.@zeta_0 += product_Matrix_SparseVector(@@W_0, s.@words) - ; + WHILE iter < MaxIter DO + // input -> hidden layer0 + @@Training_Loss = 0; + @@Validation_Loss = 0; + @@accurate_cnt = 0; + @@dW_0.reallocate(1433,16); + @@dW_1.reallocate(16,7); + Start = SELECT s FROM Start:s + POST-ACCUM + s.@zeta_0.reallocate(16), + s.@z_0.reallocate(16), + s.@zeta_1.reallocate(7), + s.@z_1.reallocate(7), + s.@dzeta_0.reallocate(16), + s.@dz_0.reallocate(16), + s.@dzeta_1.reallocate(7), + s.@dz_1.reallocate(7), + s.@words = dropout_SparseVector(s.words, keepProb), + s.@zeta_0 += product_Matrix_SparseVector(@@W_0, s.@words) + ; - // convolve - Start = SELECT s FROM Start:s -(CITE:e)- PAPER:t - ACCUM t.@z_0 += product_ArrayAccum_const(s.@zeta_0,e.weight) - POST-ACCUM - s.@z_0 = ReLU_ArrayAccum(s.@z_0), - s.@z_0 = dropout_ArrayAccum(s.@z_0, keepProb), - // hidden layer0 -> hidden layer1 - s.@zeta_1 += product_Matrix_Vector(@@W_1, s.@z_0) - ; - - // convolve - Start = SELECT s FROM Start:s -(CITE:e)- PAPER:t - ACCUM t.@z_1 += product_ArrayAccum_const(s.@zeta_1,e.weight) - POST-ACCUM - s.@y = softmax_ArrayAccum(s.@z_1), - CASE - WHEN s.train THEN - s.@dz_1 = diff_ArrayAccum_oneHotVec(s.@y,s.class_label), - @@Training_Loss += -log(s.@y[s.class_label]) - WHEN s.validation THEN - @@Validation_Loss += -log(s.@y[s.class_label]) - WHEN s.test THEN - INT y_prediction = 0, - DOUBLE maxProb = s.@y[0], - FOREACH i IN RANGE[1,6] DO - IF s.@y[i] > maxProb THEN y_prediction = i, maxProb = s.@y[i] END - END, - IF 
y_prediction == s.class_label THEN @@accurate_cnt += 1 END - END; + // convolve + Start = SELECT s FROM Start:s -(CITE:e)- PAPER:t + ACCUM t.@z_0 += product_ArrayAccum_const(s.@zeta_0,e.weight) + POST-ACCUM + s.@z_0 = ReLU_ArrayAccum(s.@z_0), + s.@z_0 = dropout_ArrayAccum(s.@z_0, keepProb), + // hidden layer0 -> hidden layer1 + s.@zeta_1 += product_Matrix_Vector(@@W_1, s.@z_0) + ; - // backpropagation - - Training1 = SELECT t FROM Start:s -(CITE:e)- PAPER:t - WHERE s.train - ACCUM t.@dzeta_1 += product_ArrayAccum_const(s.@dz_1,e.weight) - POST-ACCUM - t.@dz_0 += product_Vector_Matrix(@@W_1,t.@dzeta_1), - t.@dz_0 = greater_than_zero_ArrayAccum_ArrayAccum(t.@dz_0, t.@z_0), - FOREACH i IN RANGE[0,15] DO - FOREACH j IN RANGE[0,6] DO - @@dW_1[i][j] += t.@z_0[i]*t.@dzeta_1[j] - END - END - ; - Training0 = SELECT t FROM Training1:s -(CITE:e)- PAPER:t - ACCUM t.@dzeta_0 += product_ArrayAccum_const(s.@dz_0,e.weight) - POST-ACCUM - FOREACH (k,v) IN t.@words DO - FOREACH i IN RANGE[0,15] DO - @@dW_0[k][i] += v*t.@dzeta_0[i] - END - END - ; - @@Training_Loss += lambda*L2Norm_Matrix(@@W_0); - @@dW_0 += product_Matrix_const(@@W_0, lambda); - // @@dW_1 += product_Matrix_const(@@W_1, lambda); // only apply to the first layer - iter = iter + 1; - IF Adam THEN - @@VdW_0 = product_Matrix_const(@@VdW_0,beta1)+product_Matrix_const(@@dW_0,1-beta1); - @@VdW_1 = product_Matrix_const(@@VdW_1,beta1)+product_Matrix_const(@@dW_1,1-beta1); - @@SdW_0 = product_Matrix_const(@@SdW_0,beta2)+product_MatrixSqr_const(@@dW_0,1-beta2); - @@SdW_1 = product_Matrix_const(@@SdW_1,beta2)+product_MatrixSqr_const(@@dW_1,1-beta2); - @@W_0 += AdamGrdient(@@VdW_0,@@SdW_0,iter,alpha,beta1,beta2); - @@W_1 += AdamGrdient(@@VdW_1,@@SdW_1,iter,alpha,beta1,beta2); - ELSE - @@W_0 += product_Matrix_const(@@dW_0, -alpha); - @@W_1 += product_Matrix_const(@@dW_1, -alpha); - END; + // convolve + Start = SELECT s FROM Start:s -(CITE:e)- PAPER:t + ACCUM t.@z_1 += product_ArrayAccum_const(s.@zeta_1,e.weight) + POST-ACCUM + s.@y = softmax_ArrayAccum(s.@z_1), + CASE + WHEN s.train THEN + s.@dz_1 = diff_ArrayAccum_oneHotVec(s.@y,s.class_label), + @@Training_Loss += -log(s.@y[s.class_label]) + WHEN s.validation THEN + @@Validation_Loss += -log(s.@y[s.class_label]) + WHEN s.test THEN + INT y_prediction = 0, + DOUBLE maxProb = s.@y[0], + FOREACH i IN RANGE[1,6] DO + IF s.@y[i] > maxProb THEN y_prediction = i, maxProb = s.@y[i] END + END, + IF y_prediction == s.class_label THEN @@accurate_cnt += 1 END + END; - - PRINT iter,@@Training_Loss/train_cnt AS Training_Loss,@@Validation_Loss/val_cnt AS Validation_Loss,@@accurate_cnt/test_cnt AS accuracy;//,@@train_accurate_cnt,@@val_accurate_cnt; -END; - - // persist @@W_0 and @@W_1 in weights - WORDs = {WORD.*}; - LAYER_0s = SELECT t FROM WORDs:s -(:e)- LAYER_0:t - ACCUM - e.weight = @@W_0[s.indx][t.indx]; + // backpropagation - LAYER_1s = SELECT t FROM LAYER_0s:s -(:e)- LAYER_1:t - ACCUM - e.weight = @@W_1[s.indx][t.indx]; - -} \ No newline at end of file + Training1 = SELECT t FROM Start:s -(CITE:e)- PAPER:t + WHERE s.train + ACCUM t.@dzeta_1 += product_ArrayAccum_const(s.@dz_1,e.weight) + POST-ACCUM + t.@dz_0 += product_Vector_Matrix(@@W_1,t.@dzeta_1), + t.@dz_0 = greater_than_zero_ArrayAccum_ArrayAccum(t.@dz_0, t.@z_0), + FOREACH i IN RANGE[0,15] DO + FOREACH j IN RANGE[0,6] DO + @@dW_1[i][j] += t.@z_0[i]*t.@dzeta_1[j] + END + END + ; + Training0 = SELECT t FROM Training1:s -(CITE:e)- PAPER:t + ACCUM t.@dzeta_0 += product_ArrayAccum_const(s.@dz_0,e.weight) + POST-ACCUM + FOREACH (k,v) IN t.@words DO + FOREACH i 
IN RANGE[0,15] DO + @@dW_0[k][i] += v*t.@dzeta_0[i] + END + END + ; + @@Training_Loss += lambda*L2Norm_Matrix(@@W_0); + @@dW_0 += product_Matrix_const(@@W_0, lambda); + // @@dW_1 += product_Matrix_const(@@W_1, lambda); // only apply to the first layer + iter = iter + 1; + + IF Adam THEN + @@VdW_0 = product_Matrix_const(@@VdW_0,beta1)+product_Matrix_const(@@dW_0,1-beta1); + @@VdW_1 = product_Matrix_const(@@VdW_1,beta1)+product_Matrix_const(@@dW_1,1-beta1); + @@SdW_0 = product_Matrix_const(@@SdW_0,beta2)+product_MatrixSqr_const(@@dW_0,1-beta2); + @@SdW_1 = product_Matrix_const(@@SdW_1,beta2)+product_MatrixSqr_const(@@dW_1,1-beta2); + @@W_0 += AdamGrdient(@@VdW_0,@@SdW_0,iter,alpha,beta1,beta2); + @@W_1 += AdamGrdient(@@VdW_1,@@SdW_1,iter,alpha,beta1,beta2); + ELSE + @@W_0 += product_Matrix_const(@@dW_0, -alpha); + @@W_1 += product_Matrix_const(@@dW_1, -alpha); + END; + + + PRINT iter,@@Training_Loss/train_cnt AS Training_Loss,@@Validation_Loss/val_cnt AS Validation_Loss,@@accurate_cnt/test_cnt AS accuracy;//,@@train_accurate_cnt,@@val_accurate_cnt; + END; + + // persist @@W_0 and @@W_1 in weights + WORDs = {WORD.*}; + LAYER_0s = SELECT t FROM WORDs:s -(:e)- LAYER_0:t + ACCUM + e.weight = @@W_0[s.indx][t.indx]; + + LAYER_1s = SELECT t FROM LAYER_0s:s -(:e)- LAYER_1:t + ACCUM + e.weight = @@W_1[s.indx][t.indx]; + + } From d8ff974cb81d6449bf86e8415fdc8c0c718a3800 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 19 Jul 2022 15:22:23 -0400 Subject: [PATCH 06/93] Update predicting.gsql --- .../db_scripts/queries/predicting.gsql | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql index 9bd7665..31a025c 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql @@ -1,4 +1,20 @@ CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 { + +/* + Predict paper predictions from CitationGraph and return + accuracy. + + No inputs + + Start from WORD vertices: + (1) Load weights into layer 0 and 1 + + Using PAPER vertices: + (1) Foward propagation + (a) Convolve + (b) Hidden layers + +*/ ArrayAccum> @@W_0[1433][16]; #1433 by 16 ArrayAccum> @@W_1[16][7]; #16 by 7 SumAccum @@accurate_cnt; @@ -60,4 +76,4 @@ CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 { ; PRINT @@accurate_cnt/test_cnt AS accuracy; PRINT @@Graph,Start[Start.@prediction,Start.class_label]; -} \ No newline at end of file +} From 301c24b36428e6c7c3f2b75a86f64da6e67c103d Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 19 Jul 2022 15:24:39 -0400 Subject: [PATCH 07/93] Update predicting.gsql --- .../db_scripts/queries/predicting.gsql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql index 31a025c..76ea849 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql @@ -1,8 +1,8 @@ CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 { /* - Predict paper predictions from CitationGraph and return - accuracy. + Predict the class of the papers in a citation network and + return accuracies. 
No inputs From e40dcd3444e8c8cc2ab3833fa261f90a6fa9f92c Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:38:24 -0400 Subject: [PATCH 08/93] Update app_impact.gsql --- .../db_scripts/queries/app_impact.gsql | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql index 344a983..2b23268 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql @@ -1,4 +1,20 @@ CREATE QUERY app_impact(vertex a, float decay, int k) FOR GRAPH Storage SYNTAX V2 { +/* + + Detect the top k applications which have the most impact on a given + application. + + Sample inputs: + a: + decay: 0.5 + k: 5 + + Starting from an input application, + (1) Find all applications connected to the input application with + an AppCall edge and calculate their impact score and edge set. + (2) Display the resulting applications ordered by impact score. + +*/ int iteration = 0; @@ -24,4 +40,4 @@ CREATE QUERY app_impact(vertex a, float decay, int k) FOR GRAPH Sto LIMIT k; print Result; -} \ No newline at end of file +} From aa09c8ce878883fde265d121b7a4a22ecafc356f Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:44:01 -0400 Subject: [PATCH 09/93] Update storage_impact.gsql --- .../db_scripts/queries/storage_impact.gsql | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql index 48b11ac..e65042c 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql @@ -1,4 +1,21 @@ CREATE QUERY storage_impact(string vertexType, vertex input) FOR GRAPH Storage SYNTAX V2 { + +/* + Detect edges that differ from the input type where their goUpper + attribute is true. 
+ + Sample inputs: + vertexType: Application + input: + + Starting from an vertex input, + (1) Find all the vertices with an edge to the input where the + goUpper attribute for the edge is true + (2) Add the edges to a global sum accumulator + (3) Return the edge list + +*/ + OrAccum @@stop; @@ -16,4 +33,4 @@ CREATE QUERY storage_impact(string vertexType, vertex input) FOR GRAPH Storage S end; PRINT @@edgeList; -} \ No newline at end of file +} From 2ec86f2d208fac7899383d0f3cffc7eb35d3fcde Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:51:44 -0400 Subject: [PATCH 10/93] Update warning_impact.gsql --- .../db_scripts/queries/warning_impact.gsql | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql index 9280975..7e1abaf 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql @@ -1,4 +1,19 @@ CREATE QUERY warning_impact(vertex inputWarn) FOR GRAPH Storage SYNTAX V2 { +/* + Returns Alert_App, App_Service, and Service Manager edges related + to a warning vertex. + + Sample inputs: + inputWarn: + + Starting from an "inputWarn", + (1) Select all applications connected to the warning and add to + edge list + (2) Select all services related to the applications from part (1) + (3) Select all managers related to the services from part (2) + +*/ + ListAccum @@edge_List; @@ -15,4 +30,4 @@ CREATE QUERY warning_impact(vertex inputWarn) FOR GRAPH Storage SYNTAX PRINT Man; PRINT @@edge_List; -} \ No newline at end of file +} From a64aa73d55b7b1c47df49dfc8d9d030488991022 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:54:42 -0400 Subject: [PATCH 11/93] Create README.gsql --- .../db_scripts/queries/README.gsql | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql new file mode 100644 index 0000000..b82ca7f --- /dev/null +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql @@ -0,0 +1,6 @@ +CREATE QUERY README() FOR GRAPH Storage { + PRINT "Network and IT resource graph for modeling and analyzing"; + PRINT "the impact of the hardware outage on workloads."; + + PRINT "The queries do not need to be run in a specific order."; +} From c4776eed70894fbcabc0aa75374e57c70bec6958 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:56:36 -0400 Subject: [PATCH 12/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql index b82ca7f..03e0393 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql @@ -1,6 +1,10 @@ CREATE QUERY README() FOR GRAPH Storage { - PRINT "Network and IT resource graph for modeling and analyzing"; - PRINT "the impact of the 
hardware outage on workloads."; + /* + Network and IT resource graph for modeling and analyzing + the impact of the hardware outage on workloads. + + The queries do not need to be run in a specific order. + */ - PRINT "The queries do not need to be run in a specific order."; + PRINT "README worked!"; } From 8465933aebb853cb607c9bfdcb010ea6bc2bf581 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:38:22 -0400 Subject: [PATCH 13/93] Update predicting.gsql --- Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql index 76ea849..d9f7c0b 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql @@ -1,7 +1,7 @@ CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 { /* - Predict the class of the papers in a citation network and + Predicts the class of the papers in a citation network and return accuracies. No inputs From 02566b5ed0016ccd3a98994684d60e8ff076cf16 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:38:57 -0400 Subject: [PATCH 14/93] Update predicting.gsql --- Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql index d9f7c0b..40cd9b2 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql @@ -55,7 +55,7 @@ CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 { ACCUM t.@z_0 += product_ArrayAccum_const(s.@zeta_0,e.weight) POST-ACCUM s.@z_0 = ReLU_ArrayAccum(s.@z_0), - // hidden layer0 -> hidden layer1 */ + // hidden layer0 -> hidden layer1 */ s.@zeta_1 += product_Matrix_Vector(@@W_1, s.@z_0) ; From d352d9934ab0a385e8d5de959eebb979bda18be9 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:39:09 -0400 Subject: [PATCH 15/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 43 +++++++++++-------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/README.gsql b/Graph-Convolutional-Networks/db_scripts/queries/README.gsql index e7988f0..81e42a9 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/README.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/README.gsql @@ -1,21 +1,26 @@ -CREATE QUERY README(/* Parameters here */) FOR GRAPH CitationGraph SYNTAX V2 { - /* -The recommendation system can predict the movie ratings based on the latent factor (model-based) method. -To train the latent factor model, run the queries below in sequence -The graph convolutional network (GCN) is applied for node classification. -Specifically in this starter kit, it is used to prediction the class of the papers in a citation network -The hyperparameters in the GCN model is suggested in Thomas N. Kipf and Max Welling, ICLR (2017). -To train the GCN, the order of the queries below must be followed to obtain the useful prediction. +CREATE QUERY README(/* Parameters here */) FOR GRAPH CitationGraph SYNTAX V2 { -1. 
initialization -2. weight_initialization -3. training -4. predicting - -To re-train model using different training data split, -users can modify the initialization query before repeat the steps above. -The order of the queries need to be followed to ensure the correctness of the result. - */ +STRING graph_description = "The recommendation system can predict the movie ratings based on the latent + factor (model-based) method. To train the latent factor model, run the queries below in sequence + The graph convolutional network (GCN) is applied for node classification. + Specifically in this starter kit, it is used to prediction the class of the papers in a citation + network The hyperparameters in the GCN model is suggested in Thomas N. Kipf and Max Welling, ICLR (2017). + To train the GCN, the order of the queries below must be followed to obtain the useful prediction."; - PRINT "README worked!"; -} \ No newline at end of file +STRING query_order = "1. initialization, 2. weight_initialization, 3. training, 4. predicting"; +STRING order_note = "To re-train model using different training data split, + users can modify the initialization query before repeat the steps above. + The order of the queries need to be followed to ensure the correctness of the result."; + +STRING initialization = "Initializes weights on edges and attributes before splitting vertices into + training, validation, and testing sets."; +STRING weight_initialization = "Initializes the weights for the neural network."; +STRING training = "Trains the graph convolutional neural network on the training dataset + and evaluates the loss on the validation data and the prediction accuracy on the + testing data."; +STRING predicting = "Predicts the class of the papers in a citation network and + return accuracies."; + +PRINT graph_description, query_order, order_note, initialization, weight_initialization, training, predicting; + +} From b688742a889c2d002d31dd088f8bcbafadfbc84f Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:42:00 -0400 Subject: [PATCH 16/93] Update app_impact.gsql --- .../db_scripts/queries/app_impact.gsql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql index 2b23268..5cdde36 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql @@ -35,9 +35,9 @@ CREATE QUERY app_impact(vertex a, float decay, int k) FOR GRAPH Sto result = result UNION start; end; - Result = SELECT s from result:s + final = SELECT s from result:s ORDER BY s.@impact_score DESC LIMIT k; - print Result; + print final; } From 1f4a859afbde8641beb568a901fed4923ac91cf7 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:42:55 -0400 Subject: [PATCH 17/93] Update training.gsql --- .../db_scripts/queries/training.gsql | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/training.gsql b/Graph-Convolutional-Networks/db_scripts/queries/training.gsql index 2232eda..2a95859 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/training.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/training.gsql @@ -22,15 +22,6 @@ FOR GRAPH CitationGraph SYNTAX V2 { No sample inputs - Default 
inputs: - alpha0 = 0.4 - Adam = True - beta1 = 0.9 - beta2 = 0.999 - keepProb = 1.0 - lambda = 0.00005 - MaxIter = 10 - Start with all WORD vertices: (1) Load weights into layer 0 and layer 1 From 47a6b0ca10c85aaf669da0561683f817d3534888 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:43:31 -0400 Subject: [PATCH 18/93] Update predicting.gsql --- Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql index 40cd9b2..2b062aa 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql @@ -2,7 +2,7 @@ CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 { /* Predicts the class of the papers in a citation network and - return accuracies. + returns accuracies. No inputs From b068f1468471f8a8aa543902cbe5a342fc0dcdda Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 16:06:42 -0400 Subject: [PATCH 19/93] Update app_impact.gsql --- .../db_scripts/queries/app_impact.gsql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql index 5cdde36..9764951 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql @@ -5,9 +5,9 @@ CREATE QUERY app_impact(vertex a, float decay, int k) FOR GRAPH Sto application. Sample inputs: - a: - decay: 0.5 - k: 5 + a: 9998 | 9649 | 5679 + decay: 0.5 | 0.8 | 1 + k: 5 | 4 | 10 Starting from an input application, (1) Find all applications connected to the input application with From 7501261b589ee45d6f9fd85fb7222bf65aa2476f Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 16:57:03 -0400 Subject: [PATCH 20/93] Update warning_impact.gsql --- .../db_scripts/queries/warning_impact.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql index 7e1abaf..7e3663b 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql @@ -4,7 +4,7 @@ CREATE QUERY warning_impact(vertex inputWarn) FOR GRAPH Storage SYNTAX to a warning vertex. 
Sample inputs: - inputWarn: + inputWarn: 2000821 Starting from an "inputWarn", (1) Select all applications connected to the warning and add to From 1fd16c24741487d177f5d9667c1a24f09344eb09 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:05:36 -0400 Subject: [PATCH 21/93] Update storage_impact.gsql --- .../db_scripts/queries/storage_impact.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql index e65042c..498ad8d 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql @@ -2,7 +2,7 @@ CREATE QUERY storage_impact(string vertexType, vertex input) FOR GRAPH Storage S /* Detect edges that differ from the input type where their goUpper - attribute is true. + storage attribute is true. Sample inputs: vertexType: Application From 8aa444b65c69fcd5908bb91986aea4b164c92b3b Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:07:11 -0400 Subject: [PATCH 22/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql index 03e0393..752a602 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql @@ -1,10 +1,14 @@ CREATE QUERY README() FOR GRAPH Storage { - /* - Network and IT resource graph for modeling and analyzing - the impact of the hardware outage on workloads. + STRING graph_description = "Network and IT resource graph for modeling and analyzing + the impact of the hardware outage on workloads."; + + STRING query_order = "no order"; + STRING app_impact = "Detect the top k applications which have the most impact on a given + application."; + STRING storage_impact = "Detect edges that differ from the input type where their goUpper + storage attribute is true."; + STRING warning_impact = "Returns Alert_App, App_Service, and Service Manager edges related + to a warning vertex"; - The queries do not need to be run in a specific order. - */ - - PRINT "README worked!"; + PRINT graph_description, query_order_query_name; } From 19b2c6cc9d7e949b129f218a69e10aefd038f827 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:31:47 -0400 Subject: [PATCH 23/93] Update splitData.gsql --- .../db_scripts/queries/splitData.gsql | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/splitData.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/splitData.gsql index 8869de1..5395a76 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/splitData.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/splitData.gsql @@ -1,8 +1,17 @@ -CREATE QUERY split_data() FOR GRAPH Recommender SYNTAX V2 { - // This query split rating data into validation set and training set. - // The fraction of testing data is set to be 30%. (i.e. 
30% of the rating data will be used for model validation - // and the rest 70% will be used for model training). - // This query also output the size of total data set, the validation data set and the training data set. +CREATE QUERY split_data() FOR GRAPH Recommender SYNTAX V2 { +/* + Splits rating data into validation set and training set with 30% data + for testing. + + No inputs + + Starting from all USER vertices: + (1) Select the validation data and training data + (2) Returns the size of the total data set, validation data set, and + training data set + +*/ + SumAccum @@cnt_total; SumAccum @@cnt_validation; SumAccum @@cnt_training; @@ -26,4 +35,4 @@ CREATE QUERY split_data() FOR GRAPH Recommender SYNTAX V2 { PRINT @@cnt_total,@@cnt_validation,@@cnt_training; // print out the size of total data set, the validation data set and the training data set -} \ No newline at end of file +} From 98f8f5c292523a0c35756eb7d6332f0a445929d9 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:35:45 -0400 Subject: [PATCH 24/93] Update normalization.gsql --- .../db_scripts/queries/normalization.gsql | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/normalization.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/normalization.gsql index e197e83..d09f8fc 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/normalization.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/normalization.gsql @@ -1,6 +1,18 @@ -CREATE QUERY normalization(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { - // This query normalizes the ratings by substracting each rating by the average rating of the movie. - // The average rating of each movie is computed from the training data +CREATE QUERY normalization(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { +/* + Normalizes the ratings by subtracting each rating by the + average rating of the movie from the training data. 
+ + No inputs + + Starting with all MOVIE vertices: + (1) Select movies from the training data and calculate + the average rating + (2) Subtract the average rating from each movie's + rating + +*/ + AvgAccum @avg_rating; Start = {MOVIE.*}; Start = SELECT s FROM Start:s -(rate:e)- USER:t @@ -15,4 +27,4 @@ CREATE QUERY normalization(/* Parameters here */) FOR GRAPH Recommender SYNTAX V Start = SELECT s FROM Start:s -(rate:e)- USER:t ACCUM e.rating = e.rating - s.@avg_rating; //substract each rating by the average rating of the movie -} \ No newline at end of file +} From f5f65647745d8de5892add9bb253b4e423d5936e Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 21 Jul 2022 18:02:53 -0400 Subject: [PATCH 25/93] Update initialization.gsql --- .../db_scripts/queries/initialization.gsql | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/initialization.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/initialization.gsql index 16e4697..a66a837 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/initialization.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/initialization.gsql @@ -1,8 +1,23 @@ CREATE QUERY initialization(INT num_latent_factors = 19) FOR GRAPH Recommender SYNTAX V2 { +/* + Initializes the latent factor vectors for the users and the movies + by a normal distributed random number generator. + + No inputs + + Starting with all MOVIE vertices: + (1) Assign a random number to the latent factor vectors of the + movies + + Using all USER vertices: + (1) Assign a random number to the latent factor vectors of the + users + +*/ // This query initialize the latent factor vectors for the users and the movies // The elements in the latent factor vectors are initialized by a normal distributed random number generator // The query inputs are the standard deviation and the mean of the normal distribution - + ListAccum @init; //The length of the latent factor vector (i.e. the number of features) is set as 19. This number has to be the same as the num_latent_factors in the training query @@ -25,4 +40,4 @@ CREATE QUERY initialization(INT num_latent_factors = 19) FOR GRAPH Recommender S s.theta = s.@init; PRINT "Initialization Completed"; -} \ No newline at end of file +} From d19e50767826085fcd6f0573b657cd484c409121 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 2 Aug 2022 02:46:04 -0400 Subject: [PATCH 26/93] Update training.gsql --- .../db_scripts/queries/training.gsql | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/training.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/training.gsql index 5096df1..7e003f2 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/training.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/training.gsql @@ -1,4 +1,16 @@ CREATE QUERY training(DOUBLE learning_rate = 0.001, DOUBLE regularization_factor = 0.00005, INT Iter=100) FOR GRAPH Recommender SYNTAX V2 { + /* + Trains recommender model using the gradient descent algorithm. 
+ + No inputs + + Starting with all movies vertices and all user vertices: + (1) Pass x and theta to local accum + (2) Obtain the latent factor vectors using gradient descent algorithm + (3) Outputs the root mean square error (RMSE) for every iteration + */ + + //This query trains the recommender model using gradient descent algorithm //The number of features is set as 19. This number has to be the same as the num_latent_factors in the initialization query //The query inputs are the learning rate, regularization_factor and the number of training iterations @@ -88,4 +100,4 @@ CREATE QUERY training(DOUBLE learning_rate = 0.001, DOUBLE regularization_factor END, s.theta = s.@tmp; -} \ No newline at end of file +} From dd369b6f006d8a3bb209a0a145bef99d74e6251b Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 2 Aug 2022 02:49:38 -0400 Subject: [PATCH 27/93] Update test.gsql --- .../db_scripts/queries/test.gsql | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql index 5cb9fcb..3337896 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql @@ -1,4 +1,19 @@ CREATE QUERY test(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { + +/* + Outputs the real ratings provided by a user together with the predicted rating + by the model + + Sample Input: + user: + + Starting with a user vertex: + (1) Select movies with edges to the user + (2) Sum accum to the predicted rating and average rating of the movie + (3) Print the real ratings by the user and the predicted model + ratings + +*/ //This query output the real ratings provided by a user together with the predicted rating by the model //The query input is a user id //The query output is all the ratings given by the user and the ratings prediction @@ -12,4 +27,4 @@ CREATE QUERY test(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { t.@real_rating += e.rating+t.avg_rating; PRINT Start[Start.@real_rating,Start.@predicted_rating]; -} \ No newline at end of file +} From 0e25fa68853dad0a1543d5fc581458fa105cd96e Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 2 Aug 2022 02:50:57 -0400 Subject: [PATCH 28/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql index 72b5de7..6e4b5f1 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql @@ -1,19 +1,30 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { /* -The recommendation system can predict the movie ratings based on the latent factor (model-based) method. -To train the latent factor model, run the queries below in sequence -1. splitData -2. normalization -3. initialization -4. training + To re-train model using different training data split, + the data need to be reloaded before repeating the steps. + The order of the queries need to be followed to ensure the + correctness of the result. 
+*/ -To test the model and use it for recommendation, run the queries below -1. test -2. recommend +STRING graph_name = "In Database Machine Learning Recommendation"; +STRING graph_description = "Provides content and products suggestions using an in-database machine learning recommendation system. + The recommendation system can predict the movie ratings based on the latent factor (model-based) method."; + +STRING query_train_order = "1. splitData, 2. normalization, 3. initialization, 4. training"; +STRING query_test_order = "1. test, 2. recommend"; -To re-train model using different training data split, the data need to be reloaded before repeat the steps above. -The order of the queries need to be followed to ensure the correctness of the result. -*/ +STRING splitData = "Splits rating data into validation set and training set with 30% data + for testing."; +STRING normalization = "Normalizes the ratings by subtracting each rating by the + average rating of the movie from the training data."; +STRING initialization = "Initializes the latent factor vectors for the users and the movies + by a normal distributed random number generator."; +STRING training = "Trains recommender model using the gradient descent algorithm."; +STRING test = "Outputs the real ratings provided by a user together with the predicted rating by the model."; +STRING recommend = "Outputs the top-10 movies recommended to a user"; +STRING cal_avg_rating = "Calculates the average rating across all movies."; + + +PRINT graph_name, graph_description, query_train_order, query_test_order, splitData, normalization, initialization, training, test, recommend,cal_avg_rating; - PRINT "README works!"; -} \ No newline at end of file +} From d9b65a267b94f564296b620676d9ad3878772c66 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 2 Aug 2022 02:52:49 -0400 Subject: [PATCH 29/93] Update cal_avg_rating.gsql --- .../db_scripts/queries/cal_avg_rating.gsql | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/cal_avg_rating.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/cal_avg_rating.gsql index e08196b..e88b478 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/cal_avg_rating.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/cal_avg_rating.gsql @@ -1,5 +1,14 @@ CREATE QUERY cal_avg_rating(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { - /* Write query logic here */ + /* + Returns average rating of movies + + No inputs: + + Starting with all movies: + (1) Select movies with edges to users + (2) Accum the average rating from the edges + +*/ AvgAccum @avg_rating; Start = {MOVIE.*}; Start = SELECT s FROM Start:s -(rate:e)- USER:t @@ -8,4 +17,4 @@ CREATE QUERY cal_avg_rating(/* Parameters here */) FOR GRAPH Recommender SYNTAX POST-ACCUM s.avg_rating = s.@avg_rating; PRINT Start; -} \ No newline at end of file +} From 4aea2da694919f5e853cd3a30ea6f1d8aaeb98b3 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 2 Aug 2022 02:55:04 -0400 Subject: [PATCH 30/93] Update recommend.gsql --- .../db_scripts/queries/recommend.gsql | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/recommend.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/recommend.gsql index 
e9bcf01..e72be8c 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/recommend.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/recommend.gsql @@ -1,4 +1,16 @@ CREATE QUERY recommend(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { +/* + Outputs the top-10 movies recommended to a user. The movies are recommended + based on the rating prediction. + + Sample inputs: + user: 1 | 2 + + Starting with all movies: + (1) Compute the rating prediciton based on the model + (2) Get top 10 predicted ratings descending + +*/ //This query output the top-10 movies recommended to a user //The movies are recommended based on the rating prediction SumAccum @predicted_rating; @@ -12,4 +24,4 @@ CREATE QUERY recommend(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { LIMIT 10; PRINT "Recommendation (based on model)"; PRINT MOVIEs[MOVIEs.name, MOVIEs.@predicted_rating]; -} \ No newline at end of file +} From 02fc8456ec358dfb3f83ed3c93305153361a589e Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Tue, 2 Aug 2022 02:55:24 -0400 Subject: [PATCH 31/93] Update test.gsql --- .../db_scripts/queries/test.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql index 3337896..d9bff61 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql @@ -5,7 +5,7 @@ CREATE QUERY test(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { by the model Sample Input: - user: + user: 1 | 2 Starting with a user vertex: (1) Select movies with edges to the user From aa105ac255da3dc02e4163d4c9eb44177b9c35b1 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 01:45:27 -0400 Subject: [PATCH 32/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql index 6e4b5f1..aa651b5 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql @@ -7,16 +7,18 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { */ STRING graph_name = "In Database Machine Learning Recommendation"; -STRING graph_description = "Provides content and products suggestions using an in-database machine learning recommendation system. - The recommendation system can predict the movie ratings based on the latent factor (model-based) method."; +STRING graph_description = "Provides content and products suggestions" + + "using an in-database machine learning recommendation system. " + + "The recommendation system can predict the movie ratings based " + + "on the latent factor (model-based) method."; STRING query_train_order = "1. splitData, 2. normalization, 3. initialization, 4. training"; STRING query_test_order = "1. test, 2. 
recommend"; -STRING splitData = "Splits rating data into validation set and training set with 30% data - for testing."; -STRING normalization = "Normalizes the ratings by subtracting each rating by the - average rating of the movie from the training data."; +STRING splitData = "Splits rating data into validation set and training set " + + "with 30% data for testing."; +STRING normalization = "Normalizes the ratings by subtracting each rating by " + + "the average rating of the movie from the training data."; STRING initialization = "Initializes the latent factor vectors for the users and the movies by a normal distributed random number generator."; STRING training = "Trains recommender model using the gradient descent algorithm."; From cfb40fe35bb58157ceeefbccd069ea49fe4395b8 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 01:52:52 -0400 Subject: [PATCH 33/93] Update feature_collection.gsql --- .../db_scripts/queries/feature_collection.gsql | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/feature_collection.gsql b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/feature_collection.gsql index 66206ce..5e6f174 100644 --- a/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/feature_collection.gsql +++ b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/feature_collection.gsql @@ -1,4 +1,19 @@ CREATE query feature_collection(vertex phoneId, int durationLimit = 600, int numOfCallLimit = 10) for graph sdmGraph SYNTAX V2 { + /* + Returns stable connections given phone id, number of calls, and duration limit. + + Sample inputs: + phoneId: 1 | 2 + + Starting with a seed phoneID: + (1) Select phone ids with edges to the seed. 
+ (2) Store stable connection edges and stable targets + (3) Update neighbor information among target group + (4) Count stable calls + (5) Return seed, target group, edges, and stable connection list + +*/ + TYPEDEF TUPLE CallInfo; SumAccum @stableCount; GroupByAccum tid, ListAccum callInfo> @NB_Info; @@ -65,4 +80,4 @@ CREATE query feature_collection(vertex phoneId, int durationLimit = 600, PRINT TargetGroup; PRINT @@target_Group_Edge_List; PRINT @@in_Group_Stable_Connection_List; -} \ No newline at end of file +} From da15c15c4a8886a7296c84f129dbf5ad03239f4e Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 01:55:48 -0400 Subject: [PATCH 34/93] Create README.gsql --- .../db_scripts/queries/README.gsql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql diff --git a/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql new file mode 100644 index 0000000..c79bae2 --- /dev/null +++ b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql @@ -0,0 +1,13 @@ +CREATE QUERY README(/* Parameters here */) FOR GRAPH sdmGraph SYNTAX V2 { + +STRING graph_name = "Machine-Learning-and-Real-time-Fraud-Detection"; +STRING graph_description = "Mobile Industry example for detecting fraud in real-time " + + "and generating graph-based features for training the machine learning solution"; + +STRING query_order = "No order"; + +STRING feature_collection = "Returns stable connections given phone id, number of calls, and duration limit."; + +PRINT graph_name, graph_description, query_order, feature_collection; + +} From b58e94ac265148ebd2b4e247903e8f32ad7908e6 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:00:05 -0400 Subject: [PATCH 35/93] Update compare_approximation.gsql --- .../queries/compare_approximation.gsql | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/compare_approximation.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/compare_approximation.gsql index 2a5eeb3..3f16ebd 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/compare_approximation.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/compare_approximation.gsql @@ -1,4 +1,19 @@ CREATE QUERY compare_approximation(VERTEX row_index) FOR GRAPH LowRankApproximation SYNTAX V2 { +/* + Outputs the element values of one row of the loaded matrix specified by the + inputed row index together with the approximated element + + Sample inputs: + row_index: + + Starting with a row index: + (1) Get columns of elements in row + (2) Accum to approximated values and real values + t.@approximated_value += dotProduct_List_List(s.u,t.v), + t.@real_value += e.element_value + (3) Outputs all the existing element values in the given row +*/ + // This query output the element values of one row of the loaded matrix specified by the inputed row index together with the approximated element values // The query input is a row index // The query output is all the existing element values in the given row. The column index is shown as the v_id of the MATRIX_COLUMN vertex. 
@@ -13,4 +28,4 @@ CREATE QUERY compare_approximation(VERTEX row_index) FOR GRAPH LowRa ORDER BY str_to_int(t.column_index) ASC; PRINT MATRIX[MATRIX.@real_value,MATRIX.@approximated_value]; -} \ No newline at end of file +} From 6ba20a7cc4beea39d1cc9d4e9cfc3ca1c0c392b4 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:04:15 -0400 Subject: [PATCH 36/93] Update initialization.gsql --- .../db_scripts/queries/initialization.gsql | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/initialization.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/initialization.gsql index 9b0d3e0..ca97ade 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/initialization.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/initialization.gsql @@ -1,4 +1,17 @@ CREATE QUERY initialization(float sdv = 0.1, float mean = 0.1) FOR GRAPH LowRankApproximation SYNTAX V2 { + /* + Initializes the row vectors for the matrix U and the matrix V where elements + are initialized by a normal distributed random number generator. + + No inputs + + Starting with all matrix rows + (1) Assign the random number to the row vectors of the matrix U + + Starting with all matrix columns + (2) Assign the random number to the row vectors of the matrix V + */ + // This query initialize the row vectors for the matrix U and the matrix V // The elements in the row vectors are initialized by a normal distributed random number generator // The query inputs are the standard deviation and the mean of the normal distribution @@ -24,4 +37,4 @@ CREATE QUERY initialization(float sdv = 0.1, float mean = 0.1) FOR GRAPH LowRank END POST-ACCUM s.v = s.@init; -} \ No newline at end of file +} From 53444077bbfb13a478f67f311fd98536db0db21c Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:07:50 -0400 Subject: [PATCH 37/93] Update factorization.gsql --- .../db_scripts/queries/factorization.gsql | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/factorization.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/factorization.gsql index 99a2ca3..17a39f3 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/factorization.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/factorization.gsql @@ -1,4 +1,16 @@ CREATE QUERY factorization(DOUBLE learning_rate = 0.001, DOUBLE regularization_factor = 0.00005, INT Iter=30) FOR GRAPH LowRankApproximation SYNTAX V2 { + /* + Factorizes the loaded sparse matrix into two low-rank matrices U and V using + gradient descent algorithm. + + No inputs + + Starting with all matrix rows: + (1) Pass u and v to local accum + (2) Obtain the row vectors using gradient descent algorithm + (3) Calculate RMSE + */ + // This query factorize the loaded sparse matrix into two low-rank matrices U and V using gradient descent algorithm // The length of row vectors is set as 19. 
This number has to be the same as the len_of_rowVector in the initialization query // The query inputs are the learning rate, regularization_factor and the number of iterations @@ -82,4 +94,4 @@ CREATE QUERY factorization(DOUBLE learning_rate = 0.001, DOUBLE regularization_f END, s.v = s.@tmp; -} \ No newline at end of file +} From 74f89bf65ed10f9fc879c24902c47a0bcea6c5f2 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:11:04 -0400 Subject: [PATCH 38/93] Update print_result.gsql --- .../db_scripts/queries/print_result.gsql | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/print_result.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/print_result.gsql index 29c6274..d782118 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/print_result.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/print_result.gsql @@ -1,4 +1,18 @@ CREATE QUERY print_result(/* Parameters here */) FOR GRAPH LowRankApproximation SYNTAX V2 { + /* + Prints out two matrices U and V factorized from the loaded sparse matrix. + + No inputs + + Starting with all matrix rows: + (1) Order rows by row index ascending + and print matrix U + + Using all matrix columns: + (1) Order columns by column index ascending + and print matrix V + */ + // This query print out two matrices U and V factorized from the loaded sparse matrix. The row index of the U and V are shown as the v_id of MATRIX_ROW and MATRIX_COLUMN respectively. The row vectors of each row are shown as MATRIX_U.u and MATRIX_V.v. MATRIX_U = {MATRIX_ROW.*}; MATRIX_U = SELECT s FROM MATRIX_U:s @@ -9,4 +23,4 @@ CREATE QUERY print_result(/* Parameters here */) FOR GRAPH LowRankApproximation MATRIX_V = SELECT s FROM MATRIX_V:s ORDER BY str_to_int(s.column_index) ASC; PRINT MATRIX_V [MATRIX_V.v]; -} \ No newline at end of file +} From 4b60eb6354343cb68ca8f309993435b88e0da08d Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:12:44 -0400 Subject: [PATCH 39/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql index aa651b5..6786cf4 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql @@ -6,7 +6,7 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { correctness of the result. */ -STRING graph_name = "In Database Machine Learning Recommendation"; +STRING name = "In Database Machine Learning Recommendation"; STRING graph_description = "Provides content and products suggestions" + "using an in-database machine learning recommendation system. 
" + "The recommendation system can predict the movie ratings based " + @@ -27,6 +27,6 @@ STRING recommend = "Outputs the top-10 movies recommended to a user"; STRING cal_avg_rating = "Calculates the average rating across all movies."; -PRINT graph_name, graph_description, query_train_order, query_test_order, splitData, normalization, initialization, training, test, recommend,cal_avg_rating; +PRINT name, graph_description, query_train_order, query_test_order, splitData, normalization, initialization, training, test, recommend,cal_avg_rating; } From 4c228a60720138793d5b9f22679596289fcbe2ec Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:13:06 -0400 Subject: [PATCH 40/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql index c79bae2..e8fdbad 100644 --- a/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql +++ b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql @@ -1,6 +1,6 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH sdmGraph SYNTAX V2 { -STRING graph_name = "Machine-Learning-and-Real-time-Fraud-Detection"; +STRING name = "Machine-Learning-and-Real-time-Fraud-Detection"; STRING graph_description = "Mobile Industry example for detecting fraud in real-time " + "and generating graph-based features for training the machine learning solution"; @@ -8,6 +8,6 @@ STRING query_order = "No order"; STRING feature_collection = "Returns stable connections given phone id, number of calls, and duration limit."; -PRINT graph_name, graph_description, query_order, feature_collection; +PRINT name, graph_description, query_order, feature_collection; } From e95b2b825dbc95e835e0055db3ae8a712aec0187 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:14:01 -0400 Subject: [PATCH 41/93] Update README.gsql --- Graph-Convolutional-Networks/db_scripts/queries/README.gsql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Graph-Convolutional-Networks/db_scripts/queries/README.gsql b/Graph-Convolutional-Networks/db_scripts/queries/README.gsql index 81e42a9..078e28d 100644 --- a/Graph-Convolutional-Networks/db_scripts/queries/README.gsql +++ b/Graph-Convolutional-Networks/db_scripts/queries/README.gsql @@ -1,12 +1,13 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH CitationGraph SYNTAX V2 { +STRING name = "Graph-Convolutional-Networks"; STRING graph_description = "The recommendation system can predict the movie ratings based on the latent factor (model-based) method. To train the latent factor model, run the queries below in sequence The graph convolutional network (GCN) is applied for node classification. Specifically in this starter kit, it is used to prediction the class of the papers in a citation network The hyperparameters in the GCN model is suggested in Thomas N. Kipf and Max Welling, ICLR (2017). To train the GCN, the order of the queries below must be followed to obtain the useful prediction."; - + STRING query_order = "1. initialization, 2. weight_initialization, 3. training, 4. predicting"; STRING order_note = "To re-train model using different training data split, users can modify the initialization query before repeat the steps above. 
@@ -21,6 +22,6 @@ STRING training = "Trains the graph convolutional neural network on the training
 
 STRING predicting = "Predicts the class of the papers in a citation network and return accuracies.";
 
-PRINT graph_description, query_order, order_note, initialization, weight_initialization, training, predicting;
+PRINT name, graph_description, query_order, order_note, initialization, weight_initialization, training, predicting;
 
 }
From e56327fae9e3cca9f521588be7b04870d309ddd4 Mon Sep 17 00:00:00 2001
From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com>
Date: Wed, 3 Aug 2022 02:18:53 -0400
Subject: [PATCH 42/93] Update README.gsql

---
 .../db_scripts/queries/README.gsql | 34 ++++++++++++++++++----------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/README.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/README.gsql
index 2b0b10b..459bcf1 100644
--- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/README.gsql
+++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/README.gsql
@@ -1,17 +1,27 @@
 CREATE QUERY README(/* Parameters here */) FOR GRAPH LowRankApproximation SYNTAX V2 {
 /*
-The LowRankApproximation Starter Kit factorize a loaded sparse matrix A into two low-rank matrices U and V such that the matrix product of U and the transpose of V can approximate the original sparse matrix A. The U and V are obtained by minimize the Frobenius norm (or the root mean squer error) of A - U*transpose(V) using gradient descent algorithm.
-To compute U and V, run the queries below in sequence
- 1. initialization
- 2. factorization
+ The LowRankApproximation Starter Kit factorizes a loaded sparse matrix A into
+ two low-rank matrices U and V such that the matrix product of U and the transpose
+ of V can approximate the original sparse matrix A. The U and V are obtained by
+ minimizing the Frobenius norm (or the root mean square error) of
+ A - U*transpose(V) using the gradient descent algorithm.
+*/
+
+STRING name = "Low-Rank-Approximation-Machine-Learning";
+STRING graph_description = "Implements the low-rank approximation algorithm natively " +
+ "in-database to deliver personalized recommendations.";
+STRING query_order = "1. initialization, 2. factorization, 3. compare_approximation, 4. print_result";
-To compare the approximated matrix with the original matrix, run the queries below
-compare_approximation
+STRING initialization = "Initializes the row vectors for the matrix U and the matrix V " +
+ "where elements are initialized by a normally distributed random number generator.";
+STRING factorization = "Factorizes the loaded sparse matrix into two low-rank matrices " +
+ "U and V using the gradient descent algorithm.";
-To print out the matrices U and V, run print_result
+//To compare the approximated matrix with the original matrix
+STRING compare_approximation = "Outputs the element values of one row of the loaded matrix " +
+ "specified by the input row index together with the approximated element values.";
+STRING print_result = "Prints out two matrices U and V factorized from the loaded sparse matrix.";
-The order of the queries need to be followed to ensure the correctness of the result.
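The objective named in this README comment, minimizing the Frobenius norm of A - U*transpose(V), can be illustrated with a small gradient-descent loop over the observed entries. A hedged Python sketch with toy data; the rank, learning rate and entries are placeholders, not values taken from the kit:

    import numpy as np

    rng = np.random.default_rng(0)
    observed = {(0, 0): 5.0, (0, 2): 3.0, (1, 1): 4.0, (2, 0): 1.0}   # sparse entries of A
    n_rows, n_cols, rank = 3, 3, 2
    U = rng.normal(0.1, 0.1, (n_rows, rank))        # initialized like the initialization query
    V = rng.normal(0.1, 0.1, (n_cols, rank))

    lr, reg = 0.05, 0.00005                         # learning rate and regularization factor
    for epoch in range(500):
        sq_err = 0.0
        for (i, j), a_ij in observed.items():
            err = a_ij - U[i] @ V[j]                # residual of the current approximation
            sq_err += err * err
            U[i] += lr * (err * V[j] - reg * U[i])  # gradient step on both factor rows
            V[j] += lr * (err * U[i] - reg * V[j])
    print("RMSE on observed entries:", round((sq_err / len(observed)) ** 0.5, 4))

The RMSE reported at the end corresponds to the root mean square error the README describes.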
-*/ - - PRINT "README works!"; -} \ No newline at end of file +PRINT name, graph_description, query_order, initialization, factorization, compare_approximation, print_result; + +} From 26cbffab874dd5920069d9c4dc66c18fb55c76d0 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:27:14 -0400 Subject: [PATCH 43/93] Update jaccard_nbor_reaction.gsql --- .../queries/jaccard_nbor_reaction.gsql | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql index 227b1ed..91801e9 100644 --- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql @@ -3,13 +3,24 @@ CREATE QUERY jaccard_nbor_reaction(VERTEX source, STRING etype SYNTAX v1 { //example: ReportedCase=100640876 /* -Calculates the Jaccard Similarity between a given vertex and every other -vertex. A simplified version of the generic purpose algorithm -jacccard_nbor_ss in the GSQL Graph Data Science Library -https://github.com/tigergraph/gsql-graph-algorithms + Calculates the Jaccard Similarity between a given vertex and every other + vertex. A simplified version of the generic purpose algorithm + jacccard_nbor_ss in the GSQL Graph Data Science Library + https://github.com/tigergraph/gsql-graph-algorithms + + Note: In versions 3.5 and earlier, the SAMPLE clause was only supported in Syntax V1, + so this query uses Syntax V1. The default Syntax V2 may be used in future versions that support SAMPLE. + + Sample inputs: + source: + + Starting from the source vertex: + (1) Get the outdegree set size from the source + (2) Get neighbors of the source + (3) Select others (neighbors of neighbors) and calculate + Jaccard's Similarity + (4) Orders others by similarity to source -Note: In versions 3.5 and earlier, the SAMPLE clause was only supported in Syntax V1, -so this query uses Syntax V1. The default Syntax V2 may be used in future versions that support SAMPLE. */ SumAccum @intersection_Size, @@set_size_A, @set_size_B; @@ -38,4 +49,4 @@ so this query uses Syntax V1. 
The default Syntax V2 may be used in future versio PRINT Others; PRINT @@t_Size, Others.size(); -} \ No newline at end of file +} From 6d578c9e2bb944459e94625b749334053ee51c74 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:29:57 -0400 Subject: [PATCH 44/93] Update most_reported_drugs_for_company_v2.gsql --- .../most_reported_drugs_for_company_v2.gsql | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql index 37ab2d2..8d92fda 100644 --- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql @@ -1,26 +1,35 @@ CREATE QUERY most_reported_drugs_for_company_v2(STRING company_name="PFIZER", INT k=5, STRING role="PS") FOR GRAPH faers SYNTAX v2 { - // Possible values for role: PS, SS, I, C - // PS = primary suspect drug, SS = secondary suspect drug - // C = concomitant, I = interacting + + /* + Returns most reported drugs given a company. + + Sample inputs: + role: PS | SS | I | C + PS = primary suspect drug, SS = secondary suspect drug + C = concomitant, I = interacting + + Starting with all pharma companies: + (1) Find all cases where the given pharma company is the 'mfr_sndr' + (2) Find all drug sequences for the selected cases. + (3) Count occurences of each drug mentioned in each drug sequence. + (4) Print top drugs +*/ // Keep count of how many times each drug is mentioned. SumAccum @num_Cases; - // 1. Find all cases where the given pharma company is the 'mfr_sndr' Company = {PharmaCompany.*}; Cases = SELECT c FROM Company:s -(relatedTo:e)- ReportedCase:c WHERE s.mfr_sndr == company_name ; - // 2. Find all drug sequences for the selected cases. DrugSeqs = SELECT ds FROM Cases:c -(hasSequences:e)- DrugSequence:ds WHERE (role == "" OR ds.role_cod == role) ; - // 3. Count occurences of each drug mentioned in each drug sequence. TopDrugs = SELECT d FROM DrugSeqs:ds -(hasDrugs:e)- Drug:d ACCUM d.@num_Cases += 1 @@ -29,4 +38,4 @@ CREATE QUERY most_reported_drugs_for_company_v2(STRING company_name="PFIZER", ; PRINT TopDrugs; -} \ No newline at end of file +} From f9c870206a30cbe979b033f8bec830c717de83fb Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:32:32 -0400 Subject: [PATCH 45/93] Update top_side_effects_for_top_drugs.gsql --- .../top_side_effects_for_top_drugs.gsql | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/top_side_effects_for_top_drugs.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/top_side_effects_for_top_drugs.gsql index 43fe8f1..82efa91 100644 --- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/top_side_effects_for_top_drugs.gsql +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/top_side_effects_for_top_drugs.gsql @@ -1,9 +1,25 @@ CREATE QUERY top_side_effects_for_top_drugs(STRING company_name="PFIZER", INT k=5, STRING role="PS") FOR GRAPH faers SYNTAX v2 { - // Possible values for role: PS, SS, I, C + + /* + Returns most mentioned drugs, the number of reported cases, and their + side effects. 
+ + Sample inputs: + role: PS | SS | I | C // PS = primary suspect drug, SS = secondary suspect drug // C = concomitant, I = interacting + Starting with all pharma companies: + (1) Find all cases where the given pharma company is the 'mfr_sndr' + (2) For each case, attach a list of its reactions. + (3) Find all drug sequences for the selected cases, and transfer + the reaction list to the drug sequence. + (4) Count occurences of each drug mentioned in each drug sequence. + Also count the occurences of each reaction. + (5) Find only the Top K side effects for each selected Drug. +*/ + // Define a heap which sorts the reaction map (below) by count. TYPEDEF TUPLE tally; HeapAccum(k, cnt DESC) @top_Reactions; @@ -13,26 +29,21 @@ CREATE QUERY top_side_effects_for_top_drugs(STRING company_name="PFIZER", SumAccum @num_Cases; MapAccum @reaction_Tally; - // 1. Find all cases where the given pharma company is the 'mfr_sndr' Company = {PharmaCompany.*}; Cases = SELECT c FROM Company:s -(relatedTo:e)- ReportedCase:c WHERE s.mfr_sndr == company_name; - // 2. For each case, attach a list of its reactions. Tally = SELECT r FROM Cases:c -(hasReactions:e)- Reaction:r ACCUM c.@reaction_List += r.pt; - // 3. Find all drug sequences for the selected cases, and transfer - // the reaction list to the drug sequence. + DrugSeqs = SELECT ds FROM Cases:c -(hasSequences:e)- DrugSequence:ds WHERE (role == "" OR ds.role_cod == role) ACCUM ds.@reaction_List = c.@reaction_List; - // 4. Count occurences of each drug mentioned in each drug sequence. - // Also count the occurences of each reaction. TopDrugs = SELECT d FROM DrugSeqs:ds -(hasDrugs:e)- Drug:d ACCUM d.@num_Cases += 1, @@ -42,7 +53,6 @@ CREATE QUERY top_side_effects_for_top_drugs(STRING company_name="PFIZER", ORDER BY d.@num_Cases DESC LIMIT k; - // 5. Find only the Top K side effects for each selected Drug. 
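Steps (4) and (5) in the comment above amount to two nested tallies: count cases per drug, then keep the k most frequent reactions for each of the top drugs. A small plain-Python sketch with made-up case records (drug and reaction names are placeholders, not FAERS data):

    from collections import Counter

    k = 2
    # Hypothetical (drug, reactions reported on the case) records from the selected drug sequences.
    cases = [("DrugA", ["nausea", "headache"]),
             ("DrugA", ["nausea"]),
             ("DrugB", ["rash", "nausea"]),
             ("DrugA", ["dizziness", "nausea"])]

    num_cases = Counter()
    reaction_tally = {}
    for drug, reactions in cases:
        num_cases[drug] += 1                                  # tally cases per drug
        reaction_tally.setdefault(drug, Counter()).update(reactions)

    for drug, n in num_cases.most_common(k):                  # top-k drugs by case count
        print(drug, n, reaction_tally[drug].most_common(k))   # and their top-k reactions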
TopDrugs = SELECT d FROM TopDrugs:d ACCUM @@ -53,4 +63,4 @@ CREATE QUERY top_side_effects_for_top_drugs(STRING company_name="PFIZER", PRINT TopDrugs[TopDrugs.prod_ai, TopDrugs.@num_Cases, TopDrugs.@top_Reactions]; -} \ No newline at end of file +} From 14066957169d6172f9bdfcb3e666d560d8f201e4 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:35:48 -0400 Subject: [PATCH 46/93] Create README.gsql --- .../db_scripts/queries/README.gsql | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql new file mode 100644 index 0000000..78182e3 --- /dev/null +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql @@ -0,0 +1,14 @@ +CREATE QUERY README(/* Parameters here */) FOR GRAPH faers SYNTAX V2 { + +STRING name = Healthcare-Graph-Drug-Interaction-FAERS"; +STRING graph_description = "Healthcare example focused on public (FAERS) and private data for pharmaceutical drugs"; + +STRING query_order = "No order"; + +STRING jaccard_nbor_reaction = "Calculates the Jaccard Similarity between a given vertex and every other vertex."; +STRING most_reported_drugs_for_company_v2 = "Returns most reported drugs given a company."; +STRING top_side_effects_for_top_drugs = "Returns most mentioned drugs, the number of reported cases, and their side effects."; + +PRINT name, graph_description, query_order, jaccard_nbor_reaction, most_reported_drugs_for_company_v2, top_side_effects_for_top_drugs; + +} From 5189c4ce37f0d59327d20c09cb628284667eca5b Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:35:57 -0400 Subject: [PATCH 47/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql index 78182e3..4b37951 100644 --- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql @@ -1,14 +1,14 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH faers SYNTAX V2 { -STRING name = Healthcare-Graph-Drug-Interaction-FAERS"; -STRING graph_description = "Healthcare example focused on public (FAERS) and private data for pharmaceutical drugs"; + STRING name = Healthcare-Graph-Drug-Interaction-FAERS"; + STRING graph_description = "Healthcare example focused on public (FAERS) and private data for pharmaceutical drugs"; -STRING query_order = "No order"; + STRING query_order = "No order"; -STRING jaccard_nbor_reaction = "Calculates the Jaccard Similarity between a given vertex and every other vertex."; -STRING most_reported_drugs_for_company_v2 = "Returns most reported drugs given a company."; -STRING top_side_effects_for_top_drugs = "Returns most mentioned drugs, the number of reported cases, and their side effects."; + STRING jaccard_nbor_reaction = "Calculates the Jaccard Similarity between a given vertex and every other vertex."; + STRING most_reported_drugs_for_company_v2 = "Returns most reported drugs given a company."; + STRING top_side_effects_for_top_drugs = "Returns most mentioned drugs, the number of reported 
cases, and their side effects."; -PRINT name, graph_description, query_order, jaccard_nbor_reaction, most_reported_drugs_for_company_v2, top_side_effects_for_top_drugs; + PRINT name, graph_description, query_order, jaccard_nbor_reaction, most_reported_drugs_for_company_v2, top_side_effects_for_top_drugs; } From cc1fd595f84fda153c832375719c2004eb818d7d Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:38:56 -0400 Subject: [PATCH 48/93] Update jaccard_nbor_reaction.gsql --- .../db_scripts/queries/jaccard_nbor_reaction.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql index 91801e9..19402e3 100644 --- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql @@ -12,7 +12,7 @@ CREATE QUERY jaccard_nbor_reaction(VERTEX source, STRING etype so this query uses Syntax V1. The default Syntax V2 may be used in future versions that support SAMPLE. Sample inputs: - source: + source: ReportedCase=100640876 Starting from the source vertex: (1) Get the outdegree set size from the source From 8950d15f65f855b84356407accc199a58e619501 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:39:10 -0400 Subject: [PATCH 49/93] Update jaccard_nbor_reaction.gsql --- .../db_scripts/queries/jaccard_nbor_reaction.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql index 19402e3..3a75bae 100644 --- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql @@ -1,7 +1,7 @@ CREATE QUERY jaccard_nbor_reaction(VERTEX source, STRING etype ="hasReactions", INT top_k=100, INT sampSize=100) FOR GRAPH faers SYNTAX v1 { - //example: ReportedCase=100640876 + /* Calculates the Jaccard Similarity between a given vertex and every other vertex. 
A simplified version of the generic purpose algorithm From 929702d82a257d6edd64299859ae27586418f222 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:40:03 -0400 Subject: [PATCH 50/93] Update and rename A_README.gsql to README.gsql --- .../db_scripts/queries/{A_README.gsql => README.gsql} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/{A_README.gsql => README.gsql} (77%) diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/A_README.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql similarity index 77% rename from Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/A_README.gsql rename to Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql index 68e209f..501e22e 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/A_README.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql @@ -1,4 +1,4 @@ -CREATE QUERY A_README(/* Parameters here */) FOR GRAPH MyGraph SYNTAX V2 { +CREATE QUERY README(/* Parameters here */) FOR GRAPH MyGraph SYNTAX V2 { /************************************************************** * IMPORTANT : PLEASE INSTALL AND RUN THE add_weights QUERY @@ -6,4 +6,4 @@ CREATE QUERY A_README(/* Parameters here */) FOR GRAPH MyGraph SYNTAX V2 { *************************************************************/ PRINT "I read it"; -} \ No newline at end of file +} From ebc9b5f57f1d1187a48b2ea42ee79c318ad0ae05 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:42:41 -0400 Subject: [PATCH 51/93] Update add_weights.gsql --- .../db_scripts/queries/add_weights.gsql | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/add_weights.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/add_weights.gsql index 7e7e705..8046292 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/add_weights.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/add_weights.gsql @@ -1,7 +1,17 @@ CREATE QUERY add_weights(BOOL overwrite) FOR GRAPH MyGraph SYNTAX V2 { -/* This query uses the haversine formula to calculate the distances -between airports by using their latitude and longitude coordinates. -The calculated distances are measured in miles and are added as edge weights. +/* + Uses the haversine formula to calculate the distances between + airports by using their latitude and longitude coordinates. + The calculated distances are measured in miles and are added as + edge weights. + + Sample Inputs: + overwrite: True | False + + Starting with all airports: + (1) Select target airports and calculate the distance + between source and target if overwrite is True + */ ListAccum @@dont_Change_List; @@ -34,4 +44,4 @@ The calculated distances are measured in miles and are added as edge weights. 
e.miles = ceil(R * c) END; PRINT @@dont_Change_List; -} \ No newline at end of file +} From 7519dd0be24dbe35af87ba9acd1ba805a4126d58 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:47:50 -0400 Subject: [PATCH 52/93] Update shortest_ss_no_wt.gsql --- .../db_scripts/queries/shortest_ss_no_wt.gsql | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_no_wt.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_no_wt.gsql index c020565..7e5f1f9 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_no_wt.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_no_wt.gsql @@ -1,7 +1,21 @@ CREATE QUERY shortest_ss_no_wt(VERTEX source, BOOL display) FOR GRAPH MyGraph SYNTAX V2 { - /* This query is Single-Source Shortest Path without weights on edges. It calculates the shortest distance from the given vertex source to all other connected vertices, and shows one shortest path between them. -The JSON version also show visualization of the network. -The attribute version only store the distance into attribute, not the path. + /* + Single-Source Shortest Path without weights on edges. Calculates the shortest distance + from the given vertex source to all other connected vertices, and shows one shortest + path between them. + + The JSON version also show visualization of the network. + The attribute version only store the distance into attribute, not the path. + + Sample Inputs: + source: airport = "Goroka Airport" + display: True | False + + Start from the source vertex: + (1) Initialize local accumulators + (2) Select connected vertices, calculates shortest distance, and gets shortest + path + (3) Print results if display is True */ MinAccum @dis; @@ -38,4 +52,4 @@ The attribute version only store the distance into attribute, not the path. ACCUM @@edge_Set += e; PRINT @@edge_Set; END; -} \ No newline at end of file +} From f85b76bca6b2201c83d54757e2be448759f40bf1 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:50:18 -0400 Subject: [PATCH 53/93] Update shortest_ss_pos_wt.gsql --- .../db_scripts/queries/shortest_ss_pos_wt.gsql | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt.gsql index 3478223..4744f08 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt.gsql @@ -3,9 +3,20 @@ CREATE QUERY shortest_ss_pos_wt (VERTEX source, BOOL display) FOR GRAPH MyGraph and increase the time outdegree */ -/* The Bellman-Ford algorithm for single-Source Shortest Path +/* + The Bellman-Ford algorithm for single-Source Shortest Path on directed/undirected graph with positive weight. It will not detect negative cycle in this algorithm. + + Sample Inputs: + source: airport = "Goroka Airport" + display: True | False + + Start with the source vertex: + (1) Get connected vertices and update local accumulators + (2) Do V-1 iterations: Consider whether each edge lowers the best-known distance. 
+ (3) Calculates shortest paths and displays results if display is True + */ TYPEDEF TUPLE pathTuple; HeapAccum(1, dist ASC) @minPath; @@ -69,4 +80,4 @@ CREATE QUERY shortest_ss_pos_wt (VERTEX source, BOOL display) FOR GRAPH MyGraph ACCUM @@edge_Set += e; PRINT @@edge_Set; END; -} \ No newline at end of file +} From 076a3c7329c8d6144cccb8e9fe81b63cfffad596 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:52:04 -0400 Subject: [PATCH 54/93] Update shortest_ss_pos_wt_limits.gsql --- .../queries/shortest_ss_pos_wt_limits.gsql | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql index 7f6dc21..61f9dba 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql @@ -1,8 +1,19 @@ CREATE QUERY shortest_ss_pos_wt_limits (VERTEX source, BOOL display, INT maxHops, INT maxDest) FOR GRAPH MyGraph SYNTAX V2 { -/* The Bellman-Ford algorithm for single-Source Shortest Path - on directed/undirected graph with positive weight. - It will not detect negative cycle in this algorithm. +/* + The Bellman-Ford algorithm for single-Source Shortest Path + on directed/undirected graph with positive weight. + It will not detect negative cycle in this algorithm. + + Sample Inputs: + source: airport = "Goroka Airport" + display: True | False + + Start with the source vertex: + (1) Get connected vertices and update local accumulators + (2) Do V-1 iterations: Consider whether each edge lowers the best-known distance. + (3) Calculates shortest paths and displays results if display is True + */ TYPEDEF TUPLE pathTuple; HeapAccum(1, dist ASC) @min_Path; @@ -71,4 +82,4 @@ CREATE QUERY shortest_ss_pos_wt_limits (VERTEX source, BOOL display, INT maxHops ACCUM @@edge_Set += e; PRINT @@edge_Set; END; -} \ No newline at end of file +} From af4ea288a0e5234fc0c3d13f3053f36907d278c3 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:53:06 -0400 Subject: [PATCH 55/93] Update shortest_ss_pos_wt_limits.gsql --- .../db_scripts/queries/shortest_ss_pos_wt_limits.gsql | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql index 61f9dba..64a0fec 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql @@ -2,12 +2,15 @@ CREATE QUERY shortest_ss_pos_wt_limits (VERTEX source, BOOL display, INT maxHops /* The Bellman-Ford algorithm for single-Source Shortest Path - on directed/undirected graph with positive weight. + on directed/undirected graph with positive weight with + limited number of hops and distance. It will not detect negative cycle in this algorithm. 
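The "V-1 iterations" wording in these comments is the classic Bellman-Ford relaxation loop. A compact Python sketch on a toy graph with positive weights (vertex names and weights are illustrative, not airport data):

    INF = float("inf")
    edges = [("A", "B", 4), ("A", "C", 2), ("C", "B", 1), ("B", "D", 5)]   # toy positive weights
    vertices = {"A", "B", "C", "D"}
    source = "A"

    dist = {v: INF for v in vertices}
    dist[source] = 0
    for _ in range(len(vertices) - 1):     # V-1 rounds of relaxation
        for u, v, w in edges:
            if dist[u] + w < dist[v]:      # does this edge lower the best-known distance?
                dist[v] = dist[u] + w
    print(dist)                            # {'A': 0, 'C': 2, 'B': 3, 'D': 8}

With only positive weights, as in these queries, the loop converges without any negative-cycle check.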
Sample Inputs: source: airport = "Goroka Airport" display: True | False + maxHops: 3 + maxDest: 10 Start with the source vertex: (1) Get connected vertices and update local accumulators From 416027aa84f77c15ce2d23924cb90521034ba065 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 02:57:53 -0400 Subject: [PATCH 56/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql index 501e22e..00f6fd3 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql @@ -1,9 +1,20 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH MyGraph SYNTAX V2 { - /************************************************************** - * IMPORTANT : PLEASE INSTALL AND RUN THE add_weights QUERY - * BEFORE RUNNING OTHER QUERIES - *************************************************************/ + STRING name = "Graph-Analytics-Shortest-Path-Algorithms"; + STRING graph_description = "Identify the path through your network with the fewest number of hops."; - PRINT "I read it"; + STRING query_order = "1. add_weights, No order"; + + STRING add_weights = "Uses the haversine formula to calculate the distances between " + + "airports by using their latitude and longitude coordinates."; + STRING shortest_ss_no_wt = "Single-Source Shortest Path without weights on edges. " + + "Calculates the shortest distance from the given vertex source to all other " + + "connected vertices, and shows one shortest path between them."; + STRING shortest_ss_pos_wt = "The Bellman-Ford algorithm for single-Source Shortest Path " + + "on directed/undirected graph with positive weight"; + STRING shortest_ss_pos_wt_limits = "The Bellman-Ford algorithm for single-Source Shortest " + + "Path on directed/undirected graph with positive weight with limited number of hops " + + "and distance."; + + PRINT name, graph_description, query_order, add_weights, shortest_ss_no_wt, shortest_ss_pos_wt, shortest_ss_pos_wt_limits; } From 0b61cc6705f298565eb17ec9fdc5edb6b0b44fa6 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Wed, 3 Aug 2022 03:02:08 -0400 Subject: [PATCH 57/93] Update initialize_users.gsql --- .../db_scripts/queries/initialize_users.gsql | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/initialize_users.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/initialize_users.gsql index 9d0a247..7f05595 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/initialize_users.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/initialize_users.gsql @@ -1,7 +1,13 @@ CREATE QUERY initialize_users() FOR GRAPH Entity_Resolution SYNTAX v2 { -// Create a user vertex for each account and connecs the attributes -// (IP, Email, Device, Phone, Last_Name, address) of the account to the user. - +/* + Create a user vertex for each account and connects the attributes + (IP, Email, Device, Phone, Last_Name, address) of the account to the user. 
+ + No inputs + + (1) Initialize each account with a user + (2)Connect the User to all the attributes of their account +*/ // Initialize each account with a user Accounts = SELECT s FROM Account:s WHERE s.outdegree("Has_Account")==0 @@ -35,4 +41,4 @@ CREATE QUERY initialize_users() FOR GRAPH Entity_Resolution SYNTAX v2 { INSERT INTO User_Address VALUES(s.id, attr); // Note: Insertions will not be visible until after the query completes. PRINT "s1_initialize_users: Done" AS endMsg; -} \ No newline at end of file +} From ae112d687aee265b1b7238d1a67d1dfcc0edb37d Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 11:49:24 -0400 Subject: [PATCH 58/93] Update util_count_vertices.gsql --- .../db_scripts/queries/util_count_vertices.gsql | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_count_vertices.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_count_vertices.gsql index ea03064..1a17f1e 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_count_vertices.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_count_vertices.gsql @@ -1,8 +1,17 @@ CREATE QUERY util_count_vertices(STRING v_type="User") FOR GRAPH Entity_Resolution SYNTAX V2 { +/* + Counts vertices given type + + No Inputs + + Start from vertex set v_type: + (1) Count the number of vertices in v_type + +*/ SumAccum @@v_count; Source = {v_type}; H = SELECT v FROM Source:v ACCUM @@v_count += 1; PRINT @@v_count; -} \ No newline at end of file +} From 2632488232f4c9c7cc24c22fc561f277eedd572f Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 11:51:03 -0400 Subject: [PATCH 59/93] Update util_delete_users.gsql --- .../db_scripts/queries/util_delete_users.gsql | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_delete_users.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_delete_users.gsql index f755d9e..fc99ee3 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_delete_users.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_delete_users.gsql @@ -1,4 +1,15 @@ CREATE QUERY util_delete_users(bool are_you_sure=FALSE) FOR GRAPH Entity_Resolution SYNTAX V2 { + +/* + Deletes all users + + No inputs + + If are_you_sure is True: + (1) Selects and deletes all Users + (2) Prints action taken + +*/ IF are_you_sure THEN All_users = {User.*}; @@ -8,4 +19,4 @@ CREATE QUERY util_delete_users(bool are_you_sure=FALSE) FOR GRAPH Entity_Resolut ELSE PRINT "No action taken" AS endMsg; END; -} \ No newline at end of file +} From 58edbdac091a28d6c9ba368b31796caee99d09a5 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 11:51:54 -0400 Subject: [PATCH 60/93] Update util_print_vertices.gsql --- .../db_scripts/queries/util_print_vertices.gsql | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_print_vertices.gsql 
b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_print_vertices.gsql index 96023a8..d26c0d4 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_print_vertices.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_print_vertices.gsql @@ -1,6 +1,15 @@ CREATE QUERY util_print_vertices (STRING v_type="Weights") FOR GRAPH Entity_Resolution SYNTAX v2 { + +/* + Get all vertices of given type + + No inputs + + Selects all vertices of given type and prints the set + +*/ Vertices = {v_type}; // Get all vertices of type v_typpe PRINT Vertices; -} \ No newline at end of file +} From c30a34bd5a5cdff76f539180bc03b185afca70ac Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 11:53:50 -0400 Subject: [PATCH 61/93] Update util_set_weights.gsql --- .../db_scripts/queries/util_set_weights.gsql | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_set_weights.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_set_weights.gsql index 6435b92..7011ba4 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_set_weights.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_set_weights.gsql @@ -3,6 +3,15 @@ CREATE QUERY util_set_weights( DOUBLE last_name_wt=0.75, DOUBLE address_wt=0.5, DOUBLE device_wt=0.5) FOR GRAPH Entity_Resolution SYNTAX v2{ + /* + Sets all weights + + No inputs + + Sets weights from default parameters and stores in weight vertex + + */ + MapAccum @@wt_map; @@wt_map += ("User_IP" -> ip_wt); @@wt_map += ("User_Email" -> email_wt); @@ -15,4 +24,4 @@ CREATE QUERY util_set_weights( POST-ACCUM w.wt_map = @@wt_map; PRINT "init_weight_vertex: Done" AS endMsg; -} \ No newline at end of file +} From daab09fd6818288446f35279910fdd4dfbc0d45f Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 11:56:15 -0400 Subject: [PATCH 62/93] Update connect_jaccard_sim.gsql --- .../queries/connect_jaccard_sim.gsql | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_jaccard_sim.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_jaccard_sim.gsql index 330b7d6..2decb12 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_jaccard_sim.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_jaccard_sim.gsql @@ -2,9 +2,16 @@ CREATE QUERY connect_jaccard_sim (FLOAT threshold=0.5, INT topK=100, BOOL verbose=FALSE) FOR GRAPH Entity_Resolution SYNTAX V2 { /* -Calculate Jaccard similarity between each vertex and every other vertex. - Jaccard(set A, set B) = overlap_size / (size_A + size_B - overlap_size) + Calculate Jaccard similarity between each vertex and every other vertex. + Jaccard(set A, set B) = overlap_size / (size_A + size_B - overlap_size) + + No inputs + + (1) Calculate the number of eligible neighbors of each vertex + (2) Find paths from UserA->neighbor->UserB: count A&B's common neighbors + (3) Calculate Jaccard(A,B). 
Keep the scores > threshold */ + TYPEDEF TUPLE SimilarityTuple; MapAccum @@deg; // degree of each VERTEX MapAccum, INT> @intersection; // num neighbors in common @@ -15,13 +22,13 @@ Calculate Jaccard similarity between each vertex and every other vertex. @@etype_list += ["User_Last_Name","User_Address","User_Device"]; IF verbose THEN PRINT @@etype_list; END; - // Calculate the number of eligible neighbors of each vertex + Start = SELECT s FROM User:s ACCUM FOREACH e IN @@etype_list DO @@deg += (s -> s.outdegree(e)) END; - // Find paths from UserA->neighbor->UserB: count A&B's common neighbors + Others = SELECT B FROM User:A -()- (IP|Email|Phone|Last_Name|Address|Device):n -()- User:B @@ -29,9 +36,10 @@ Calculate Jaccard similarity between each vertex and every other vertex. ACCUM A.@intersection += (B -> 1), // tally each path A->B @@path_count += 1; + IF verbose THEN PRINT @@path_count; END; - // Calculate Jaccard(A,B). Keep the scores > threshold + Result = SELECT A FROM User:A ACCUM FOREACH (B, overlap) IN A.@intersection DO FLOAT score = overlap*1.0/(@@deg.get(A) + @@deg.get(B) - overlap), @@ -46,4 +54,4 @@ Calculate Jaccard similarity between each vertex and every other vertex. PRINT @@jaccard_heap; PRINT to_string(@@insert_count) + " SameAs edges inserted" AS endMsg; -} \ No newline at end of file +} From 31231d16f2911067997f72f6c62763b83d267fcc Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 12:31:58 -0400 Subject: [PATCH 63/93] Update merge_connected_users.gsql --- .../queries/merge_connected_users.gsql | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/merge_connected_users.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/merge_connected_users.gsql index 036750b..f4b2342 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/merge_connected_users.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/merge_connected_users.gsql @@ -1,11 +1,21 @@ CREATE QUERY merge_connected_users(FLOAT threshold=1.0, BOOL verbose=FALSE) FOR GRAPH Entity_Resolution SYNTAX V2 { -/*Connect users having sufficient shared attributes. The linking score between - Group users connected by SameAs edges: - 1. Find connected users using the connected component algorithm. - 2. In each component, select a lead user. - 3. In each component, connect all attributes from other users to the lead user - 4. Delete the users that are not the lead user. - */ +/* + Connect users having sufficient shared attributes using the connected + component algorithm. The linking score between Group users is connected + by SameAs edges. + + + No inputs + + Starting with all Users: + (1) Initialize each user with itself as the lead of the component + (2) Assign the min vertex ID of a connected component to every other member. + (3) The rest of the query merge all the users in each connected component into one vertex. + (4) Transfer each of the Attribute vertices (IPs, Emails, Phones, Last_Names, + Addresses, Devices) to the lead user. 
+ (5) Delete the non-lead User vertices
+
+ */
 
 // MinAccum selects the vertex with the minimum internal ID
 MinAccum> @min_user_id;
@@ -84,7 +94,7 @@ CREATE QUERY merge_connected_users(FLOAT threshold=1.0, BOOL verbose=FALSE) FOR
 VERTEX lead_usr = s.@min_user_id,
 INSERT INTO User_Device VALUES (lead_usr, t),
 DELETE (e)
- ;
+ ;
 // 4. Delete the non-lead User vertices
 Not_lead = SELECT s FROM Not_lead:s
 POST-ACCUM
@@ -92,4 +102,4 @@ CREATE QUERY merge_connected_users(FLOAT threshold=1.0, BOOL verbose=FALSE) FOR
 
 // Print whether any grouping has been performed
 PRINT converged;
-} \ No newline at end of file
+}
From 996a603681816dceb14db81834a3fad2ce5f45b2 Mon Sep 17 00:00:00 2001
From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com>
Date: Thu, 4 Aug 2022 22:19:20 -0400
Subject: [PATCH 64/93] Update score_similar_attributes.gsql

---
 .../queries/score_similar_attributes.gsql | 36 +++++++++++++------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/score_similar_attributes.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/score_similar_attributes.gsql
index eb88156..d128550 100644
--- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/score_similar_attributes.gsql
+++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/score_similar_attributes.gsql
@@ -2,10 +2,23 @@ CREATE QUERY score_similar_attributes(
 bool do_last_name=TRUE,
 bool do_address=TRUE,
 bool print_only=FALSE) FOR GRAPH Entity_Resolution
SYNTAX v2 {
- /* Considering only User-User pairs where this is already some match
- of attribute values, compare their names and their addresses using
- JaroWinkler distance (score [0,1] for [nothing in common,identical]).
- Use this to add a prorated weight to their existing similarity scores.
+
+ /*
+ Considering only User-User pairs where there is already some match
+ of attribute values, compare their names and their addresses using
+ JaroWinkler distance (score [0,1] for [nothing in common,identical]).
+ Use this to add a prorated weight to their existing similarity scores.
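The "prorated weight" idea described just above can be sketched outside GSQL: when two linked users have different attribute values, a string-similarity score in [0,1] is scaled by the attribute weight and added to the pair's existing score. The Python sketch below uses difflib's ratio purely as a stand-in for the JaroWinkler distance, and all names, weights and scores are made up:

    from difflib import SequenceMatcher

    def similarity(a: str, b: str) -> float:
        # Stand-in for the query's jaroWinklerDistance(); returns a value in [0, 1].
        return SequenceMatcher(None, a, b).ratio()

    last_name_wt = 0.75                                    # weight taken from the Weights vertex
    pair_scores = {("userA", "userB"): 0.5}                # existing scores from exact matches
    last_names = {"userA": "Johnson", "userB": "Jonson"}

    for (a, b), score in list(pair_scores.items()):
        if last_names[a] != last_names[b]:                 # identical values were already scored
            pair_scores[(a, b)] = score + similarity(last_names[a], last_names[b]) * last_name_wt

    print(pair_scores)

The same pattern repeats for addresses, with its own weight.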
+ + No inputs + + (1) Get weights of Last_Name and Address from the global Weight vertex + (2) Find all linked users (order doesn't matter), plus each user's last name + (3) If names aren't identical compute JaroWinkler * weight + (4) Find all linked users (order doesn't matter), plus each user's address + (5) If addresses aren't identical compute JaroWinkler * weight + (6) Add the new similarity scores to the existing scores + + */ TYPEDEF TUPLE String_pair; @@ -16,7 +29,7 @@ SYNTAX v2 { FLOAT name_wt = 0.0; FLOAT addr_wt = 0.0; - // Get weights of Last_Name and Address from the global Weight vertex + Wt = SELECT w FROM Weights:w POST-ACCUM IF do_last_name THEN @@ -24,14 +37,15 @@ SYNTAX v2 { IF do_address THEN addr_wt = w.wt_map.get("User_Address") END ; + + // last name Connected_users = SELECT A - // Find all linked users, plus each user's last name FROM User:A -(SameAs:e)- User:B, User:A -()- Last_Name:A_name, User:B -()- Last_Name:B_name WHERE A.id < B.id // filter so we don't count (A,B) & (B,A) ACCUM @@name_match += 1, - // If names aren't identical compute JaroWinkler * weight + IF do_last_name AND A_name.val != B_name.val THEN FLOAT sim = jaroWinklerDistance(A_name.id,B_name.id) * name_wt, @@sim_score += (A -> (B -> sim)), @@ -39,14 +53,15 @@ SYNTAX v2 { IF sim != 0 THEN @@name_update += 1 END END ; + + + // address Connected_users = SELECT A - // Find all linked users, plus each user's address FROM Connected_users:A -(SameAs:e)- User:B, User:A -()- Address:A_addr, User:B -()- Address:B_addr WHERE A.id < B.id // filter so we don't count (A,B) & (B,A) ACCUM @@addr_match += 1, - // If addresses aren't identical compute JaroWinkler * weight IF do_address AND A_addr.val != B_addr.val THEN FLOAT sim = jaroWinklerDistance(A_addr.id,B_addr.id) * addr_wt, @@sim_score += (A -> (B -> sim)), @@ -54,7 +69,6 @@ SYNTAX v2 { IF sim != 0 THEN @@addr_update += 1 END END ; - // Add the new similarity scores to the existing scores IF NOT print_only THEN Connected_users = SELECT A @@ -69,4 +83,4 @@ SYNTAX v2 { PRINT @@sim_score.size() AS num_scores; PRINT @@string_pairs; PRINT @@sim_score; -} \ No newline at end of file +} From caaa051395512f83ef657dbdc093e187c0cb8e2d Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 22:21:19 -0400 Subject: [PATCH 65/93] Update connect_weighted_match.gsql --- .../queries/connect_weighted_match.gsql | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_weighted_match.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_weighted_match.gsql index 67cb065..321be25 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_weighted_match.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_weighted_match.gsql @@ -1,20 +1,29 @@ CREATE QUERY connect_weighted_match(float threshold=0.2, bool verbose=false) FOR GRAPH Entity_Resolution SYNTAX v2{ /* - Connect users that have sufficient shared attributes. The linking score between - two users is defined as the weighted sum of their shared attributes. - Two users will be linked if the linking sc ore is above the threshold. + Connect users that have sufficient shared attributes. The linking score between + two users is defined as the weighted sum of their shared attributes. 
+ Two users will be linked if the linking score is above the threshold.
+
+ No inputs
+
+ (1) Copy Weights map to a global accumulator, so it's always available
+ (2) For each attribute connected to users, store the weight for each
+ user in the score map
+ (3) For each pair of users connected via attributes, aggregate all attribute
+ weights.
+ (4) Connect the users with a SameAs edge if the score > threshold.
+
 */
 MapAccum, SumAccum> @score;
 MapAccum @@wt_map;
 SumAccum @@insert_count, @@attr_count;
- // Copy Weights map to a global accumulator, so it's always available
+
 Wt = SELECT w FROM Weights:w
 POST-ACCUM @@wt_map += w.wt_map;
 IF verbose THEN PRINT @@wt_map; END;
- // For each attribute connected to users,
- // store the weight to each user in the score map
+
 Attributes = SELECT attr
 FROM User:usr -((User_IP|User_Email|User_Last_Name|User_Phone|User_Address|User_Device):e)- :attr
 ACCUM
@@ -22,8 +31,6 @@ CREATE QUERY connect_weighted_match(float threshold=0.2, bool verbose=false) FOR
 attr.@score += (usr -> @@wt_map.get(e.type));
 IF verbose THEN PRINT @@attr_count; END;
 
- // For each pair of users connected via attributes, aggregate all attribute
- // weights. Connect the users with a SameAs edge if the score > threshold.
 Attrs = SELECT attr
 FROM Attributes:attr -((User_IP|User_Email|User_Last_Name|User_Phone|User_Address|User_Device):e)- :usr
 ACCUM
@@ -38,4 +45,4 @@ CREATE QUERY connect_weighted_match(float threshold=0.2, bool verbose=false) FOR
 ;
 PRINT to_string(@@insert_count) + " SameAs edges inserted; " +
 "s2_connect_weighted_match: Done" AS endMsg;
-} \ No newline at end of file
+}
From d94a74a7ecc4f02f4ac63171ee591cde6506b5ee Mon Sep 17 00:00:00 2001
From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com>
Date: Thu, 4 Aug 2022 22:23:56 -0400
Subject: [PATCH 66/93] Update recommend_videos.gsql

---
 .../db_scripts/queries/recommend_videos.gsql | 41 +++++++++++++----------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/recommend_videos.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/recommend_videos.gsql
index 67d44f8..32923cf 100644
--- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/recommend_videos.gsql
+++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/recommend_videos.gsql
@@ -1,16 +1,25 @@
 CREATE QUERY recommend_videos(vertex inputAcc, int k=5)
 FOR GRAPH Entity_Resolution SYNTAX v2 {
- /* Recommend videos to User:
- 1. Find all the accounts related to the input account
- (according to the entity resolution).
- 2. Find all the videos these accounts have played.
- 3. Find all the un-watched videos which have the most genres
- or keywords in common with the played videos.
- *********************************************************
- * Example parameters:
- * Account : 407
- * k : 5
- **********************************************************/
+
+/*
+ Recommend videos to User:
+ 1. Find all the accounts related to the input account
+ (according to the entity resolution).
+ 2. Find all the videos these accounts have played.
+ 3. Find all the un-watched videos which have the most genres
+ or keywords in common with the played videos.
+ + Sample inputs: + Account : 407 + k : 5 + + Start from the input account inputAcc: + (1) Get all the accounts linked through the same user as the source account + (2) Get the videos played by the accounts of interest + (3) Tag each genre or keyword of a video played by this User + (4) Count genres or keywords an unwatched video has in common with tagged videos + (5) Show connections (edges) to the features of the recommended videos +*/ SetAccum> @@connected_accts; MapAccum @map; @@ -20,27 +29,24 @@ FOR GRAPH Entity_Resolution SYNTAX v2 { Source_acct = {inputAcc}; - // Get all the accounts linked through the same user as the source account + Related_accts = SELECT acct FROM Source_acct:s -(Has_Account:e1)- User:u -(Has_Account:e2)- Account:acct ACCUM @@edge_list += e1, @@edge_list += e2; - // This block is just for collecting edges to display Attributes = SELECT attr FROM Related_accts:s-((Has_IP|Has_Email|Has_Last_Name|Has_Phone|Has_Address|Has_Device):e)-:attr ACCUM @@edge_list += e; - // Get the videos played by the accounts of intereset Played_vids = SELECT t FROM Related_accts:s -(Has_Play_Event:e1)- :v -(Play_Video:e2)- :t ACCUM t.@cnt += 1, // tag each video played by this User @@edge_list += e1, @@edge_list += e2; - // Tag each genre or keyword of a video played by this User Video_features = SELECT t FROM Played_vids:s-((Has_Genre|Has_Keyword):e)-:t ACCUM t.@cnt += s.@cnt, @@edge_list += e; - // Count genres or keywords an unwatched video has in common with tagged videos + Recommended_vids = SELECT t FROM Video_features:s-((Has_Genre|Has_Keyword):e)-:t WHERE t.@cnt == 0 ACCUM t.@cnt += s.@cnt, t.@map += (s->s.@cnt) @@ -48,9 +54,8 @@ FOR GRAPH Entity_Resolution SYNTAX v2 { PRINT Recommended_vids; - // Show connections (edges) to the features of the recommended videos Recommended_vids = SELECT s FROM Recommended_vids:s-((Has_Genre|Has_Keyword):e)-:t ACCUM @@edge_list += e; PRINT @@edge_list; -} \ No newline at end of file +} From dfa7d019220f6f8034814e2d2f9269cffae0f8ab Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 22:49:00 -0400 Subject: [PATCH 67/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql index f8b7d47..d3f7460 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql @@ -19,7 +19,7 @@ CREATE QUERY README() FOR GRAPH Entity_Resolution SYNTAX V2 { Method 2: Exact and approximate matching 1a. Run initialize_users to create a tentative User linked to each Account. 1b. Run util_set_weights to load weights used to calibrate the scoring. - 2a. Run connect_weighted_match to link Users whose matching attribute + 2a. Run connect_weighted_match to link Users whose matching attribute values score enough points. 2b. Run score_similar_attributes to add additional points for approximate matches. @@ -42,4 +42,4 @@ CREATE QUERY README() FOR GRAPH Entity_Resolution SYNTAX V2 { with the videos already watched by this user. 
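For instance, using the sample account id shown above (GSQL shell syntax assumed; account id "407" taken from the sample inputs), the display queries can be invoked after installation with:

    RUN QUERY get_account_subgraph(["407"], true)
    RUN QUERY recommend_videos("407", 5)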
*/ PRINT version; -} \ No newline at end of file +} From b92260203926612164dccea4bf8a0cf917ea1330 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:02:59 -0400 Subject: [PATCH 68/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql index d3f7460..196dac4 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql @@ -35,11 +35,35 @@ CREATE QUERY README() FOR GRAPH Entity_Resolution SYNTAX V2 { *** Once the Entity Resolution is complete, you can see some of the results: - get_entity_subgraph: finds the User of a given Account, the other + get_account_subgraph: finds the User of a given Account, the other Accounts of that User, and the attribute vertices of this User. recommend_videos: find all the accounts linked via entity resolution to the input account, then list videos that have the most features in common with the videos already watched by this user. */ - PRINT version; + + STRING name = "In-Database-Machine-Learning-for-Big-Data-Entity-Resolution"; + STRING graph_description = "Finds Accounts that share many of the same or similar" + + "personal attributes and therefore seem to represent the same User."; + + STRING Jaccard_similarity_order = "1. initialize_users, 2. connect_jaccard_sim, 3. merge_connected_users, 4. repeat 2. and 3."; + STRING Exact_and_approximate_matching_order = "1.initialize_users, 2. util_set_weights, 3. connect_weighted_match, " + + "4. score_similar_attributes, 5. merge_similar_users, 6. 
repeat 3,4,5"; + + STRING initialize_users = "Create a user vertex for each account and connects the attributes of the account to the user."; + STRING connect_jaccard_sim = "Calculate Jaccard similarity between each vertex and every other vertex."; + STRING merge_connected_users = "Connect users having sufficient shared attributes."; + STRING util_set_weights = "Sets all weights to calibrate the scoring."; + STRING connect_weighted_match = "Connect users that have sufficient shared attributes."; + STRING score_similar_attributes = "Considering only User-User pairs where this is already some match of attribute values, " + + compare their names and their addresses using JaroWinkler distance."; + STRING merge_similar_users = "N/A"; + STRING get_account_subgraph = "Get subgraph given set of account ids"; + STRING recommend_videos = "Recommend videos to User."; + + + PRINT name, version, graph_description, Jaccard_similarity_order, Exact_and_approximate_matching_order; + PRINT initialize_users, connect_jaccard_sim, merge_connected_users; + PRINT util_set_weights, connect_weighted_match, score_similar_attributes, merge_similar_users; + PRINT get_account_subgraph, recommend_videos; } From d2268e72ec8cdbfddc22202a7a06b1537657584b Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:03:39 -0400 Subject: [PATCH 69/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql index 196dac4..cfa4f88 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql @@ -59,7 +59,7 @@ CREATE QUERY README() FOR GRAPH Entity_Resolution SYNTAX V2 { compare their names and their addresses using JaroWinkler distance."; STRING merge_similar_users = "N/A"; STRING get_account_subgraph = "Get subgraph given set of account ids"; - STRING recommend_videos = "Recommend videos to User."; + STRING recommend_videos = "Recommend videos to User by most genres or keywords in common with the played videos."; PRINT name, version, graph_description, Jaccard_similarity_order, Exact_and_approximate_matching_order; From 2648e91178cf2a85064ad72f4980158dc46f02a1 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:04:47 -0400 Subject: [PATCH 70/93] Update get_account_subgraph.gsql --- .../db_scripts/queries/get_account_subgraph.gsql | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/get_account_subgraph.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/get_account_subgraph.gsql index 3166090..67f0026 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/get_account_subgraph.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/get_account_subgraph.gsql @@ -1,6 +1,17 @@ CREATE QUERY get_account_subgraph(SET account_ids, BOOL include_attributes=FALSE) FOR GRAPH Entity_Resolution SYNTAX v2 { + /* + Get subgraph given set of account ids + + Sample inputs: + account_ids: 1 | 2 | 3 + + (1) Get vertex set from 
input set + (2) Select users connected to accounts set and collect + Has_Account edges + */ + ListAccum @@edges_to_display; INT numAccounts; INT numUsers; @@ -11,4 +22,4 @@ CREATE QUERY get_account_subgraph(SET account_ids, BOOL include_attribut PRINT "get_account_subgraph works!"; -} \ No newline at end of file +} From 25f20993535df7587e146da218553f940014a9c5 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:05:32 -0400 Subject: [PATCH 71/93] Update and rename A_README.gsql to README.gsql --- .../db_scripts/queries/{A_README.gsql => README.gsql} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/{A_README.gsql => README.gsql} (77%) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/A_README.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql similarity index 77% rename from Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/A_README.gsql rename to Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql index 00f65e1..f824b72 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/A_README.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql @@ -1,4 +1,4 @@ -CREATE QUERY A_README() FOR GRAPH MyGraph { +CREATE QUERY README() FOR GRAPH MyGraph { /* IMPORANT : PLEASE INSTALL AND RUN the insert_all_referrals QUERY FIRST. @@ -6,4 +6,4 @@ CREATE QUERY A_README() FOR GRAPH MyGraph { */ print "I read this!"; -} \ No newline at end of file +} From e25a54b28154dcd3dc4263ab411df9d094e0280f Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:10:11 -0400 Subject: [PATCH 72/93] Update Print_community.gsql --- .../db_scripts/queries/Print_community.gsql | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/Print_community.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/Print_community.gsql index 1bc96d4..b88b196 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/Print_community.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/Print_community.gsql @@ -1,6 +1,18 @@ CREATE QUERY print_community(vertex input_prescriber) FOR GRAPH MyGraph SYNTAX V2 { - /* Write query logic here */ - //PRINT "Print_community works!"; + /* + + Returns edges of community given prescriber + + Sample input: + input_prescriber: pre14 | pre25 + + Using all Prescribers: + (1) Select presribers where their community id + matches the input_prescriber and the + prescriber they refer + (2) Return referral edges of the community + + */ ListAccum @@edge_list; @@ -17,4 +29,4 @@ CREATE QUERY print_community(vertex input_prescriber) FOR GRAPH MyGr print start; print @@edge_list; -} \ No newline at end of file +} From afef34ede37ac783c0608b6c467215f06feb144b Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:38:00 -0400 Subject: [PATCH 73/93] Update algo_louvain.gsql --- .../db_scripts/queries/algo_louvain.gsql | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql 
b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql index b373b5d..fb3c772 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql @@ -2,13 +2,27 @@ CREATE QUERY algo_louvain(INT iter1 = 10, INT iter2 = 10, INT iter3 = 10, INT sp Bool sort_by_pre_ID, Bool sort_by_comm_ID) FOR GRAPH MyGraph SYNTAX V2 { /* -* Louvain Method with Parallelism and Refinement -* https://arxiv.org/pdf/1304.4453 -* The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003 -* iter: There are three phases in the algorithm -- move, merge and refine. Their max number of iterations are set by iter1, iter2, iter3 respectively. -* split: To save memory, split number is 10 by default. When the split number is larger, the query is closer to sequential Louvain Method, which is slower. When the split number is 1, the query is parallel, but requires more memory. -* output_level: 0, only list number; 1, also list members -* fComm, fDist: files to store community label and community distribution + Louvain Method with Parallelism and Refinement + https://arxiv.org/pdf/1304.4453 + The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003 + + Inputs: + iter: There are three phases in the algorithm -- move, merge and refine. Their max number of iterations are set by iter1, iter2, iter3 respectively. + split: To save memory, split number is 10 by default. When the split number is larger, the query is closer to sequential Louvain Method, which is slower. When the split number is 1, the query is parallel, but requires more memory. + output_level: 0, only list number; 1, also list members + fComm, fDist: files to store community label and community distribution + + (1) Initialize: count edges and set a unique cluster ID for each vertex + (2) Phase 1 -- Move + (a) For each vertex, calculate the change in modularity FROM adding it to each of the nearby clusters + (b) Add vertex to cluster with highest positive change in modularity + (c) Repeat the above until no vertices change cluster anymore + (3) Phase 2 -- Merge + (a) Select the vertices with minimal internal id to represent the coarsened graph + (b) Get @cweight from totalIncident + (c) Calculate.num_patient incident from vertex to cluster in coarsened graph; change every interation + (4) Phase 3 -- Refinement: run the first phase again on each vertex to do some small adjustments for the resulting communities + */ TYPEDEF TUPLE Cluster_Num; TYPEDEF TUPLE V_Delta_Q; @@ -242,7 +256,6 @@ CREATE QUERY algo_louvain(INT iter1 = 10, INT iter2 = 10, INT iter3 = 10, INT sp log(debug > 0, "[redrain]#2_merge", iteration2, @@modularity2); END; // outer WHILE - # Phase 3 -- Refinement iteration = 0; @@modularity = 0; @@ -317,4 +330,4 @@ CREATE QUERY algo_louvain(INT iter1 = 10, INT iter2 = 10, INT iter3 = 10, INT sp END; PRINT start [start.communityId]; PRINT "Community Detection Done"; -} \ No newline at end of file +} From dfff8c7328c1ad9f5f6f238689b1280bf07f093b Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:41:17 -0400 Subject: [PATCH 74/93] Update algo_louvain_enhanced.gsql --- .../queries/algo_louvain_enhanced.gsql | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain_enhanced.gsql 
b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain_enhanced.gsql index dcb0cfc..a822637 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain_enhanced.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain_enhanced.gsql @@ -4,13 +4,27 @@ CREATE QUERY algo_louvain_enhanced(STRING vertex_type, STRING edge_type, FOR GRAPH MyGraph SYNTAX V2 { /* -* Louvain Method with Parallelism and Refinement -* https://arxiv.org/pdf/1304.4453 -* The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003 -* iter: There are three phases in the algorithm -- move, merge and refine. Their max number of iterations are set by iter1, iter2, iter3 respectively. -* split: To save memory, split number is 10 by default. When the split number is larger, the query is closer to sequential Louvain Method, which is slower. When the split number is 1, the query is parallel, but requires more memory. -* output_level: 0, only list number; 1, also list members -* fComm, fDist: files to store community label and community distribution + Louvain Method with Parallelism and Refinement + https://arxiv.org/pdf/1304.4453 + The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003 + + Inputs: + iter: There are three phases in the algorithm -- move, merge and refine. Their max number of iterations are set by iter1, iter2, iter3 respectively. + split: To save memory, split number is 10 by default. When the split number is larger, the query is closer to sequential Louvain Method, which is slower. When the split number is 1, the query is parallel, but requires more memory. + output_level: 0, only list number; 1, also list members + fComm, fDist: files to store community label and community distribution + + + (1) Initialize: count edges and set a unique cluster ID for each vertex + (2) Phase 1 -- Move: incrementally calculates the modularity change of moving a vertex into every other community + and moves the vertex to the community with the highest modularity change + (a) For each vertex, calculate the change in modularity FROM adding it to each of the nearby clusters + (b) Add vertex to cluster with highest positive change in modularity + (c) Repeat the above until no vertices change cluster anymore + (3) Phase 2 -- Merge: Coarsen the graph by aggregating the vertices which are assigned in the same community into one vertex + (4) Phase 3 -- Refinement: run the first phase again on each vertex to do some small adjustments for the resulting communities + + */ TYPEDEF TUPLE Cluster_Num; TYPEDEF TUPLE V_Delta_Q; @@ -317,4 +331,4 @@ CREATE QUERY algo_louvain_enhanced(STRING vertex_type, STRING edge_type, END; PRINT start [start.@cid]; PRINT "Community Detection Done"; -} \ No newline at end of file +} From 918fd31079941fd91b3a1509445e4b96af4fe618 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:41:23 -0400 Subject: [PATCH 75/93] Update algo_louvain.gsql --- .../db_scripts/queries/algo_louvain.gsql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql index fb3c772..dc39b14 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql +++ 
b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql @@ -13,11 +13,12 @@ CREATE QUERY algo_louvain(INT iter1 = 10, INT iter2 = 10, INT iter3 = 10, INT sp fComm, fDist: files to store community label and community distribution (1) Initialize: count edges and set a unique cluster ID for each vertex - (2) Phase 1 -- Move + (2) Phase 1 -- Move: incrementally calculates the modularity change of moving a vertex into every other community + and moves the vertex to the community with the highest modularity change (a) For each vertex, calculate the change in modularity FROM adding it to each of the nearby clusters (b) Add vertex to cluster with highest positive change in modularity (c) Repeat the above until no vertices change cluster anymore - (3) Phase 2 -- Merge + (3) Phase 2 -- Merge: Coarsen the graph by aggregating the vertices which are assigned in the same community into one vertex (a) Select the vertices with minimal internal id to represent the coarsened graph (b) Get @cweight from totalIncident (c) Calculate.num_patient incident from vertex to cluster in coarsened graph; change every interation From 09bf60a061d3ed9a17087a68344d53ff1b9c7c91 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:42:50 -0400 Subject: [PATCH 76/93] Update algo_page_rank.gsql --- .../db_scripts/queries/algo_page_rank.gsql | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_page_rank.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_page_rank.gsql index 3726ebd..f50a0cd 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_page_rank.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_page_rank.gsql @@ -1,12 +1,18 @@ CREATE QUERY algo_page_rank(FLOAT max_change = 0.001, INT max_iter = 25, FLOAT damping = 0.85, INT output_limit) FOR GRAPH MyGraph SYNTAX V2 { - # Compute the pageRank score for each vertex in the GRAPH -# In each iteration, compute a score for each vertex: -# score = (1-damping) + damping*sum(received scores FROM its neighbors). -# The pageRank algorithm stops when either of the following is true: -# a) it reaches max_iter iterations; -# b) the max score change for any vertex compared to the last iteration <= max_change. +/* + Compute the pageRank score for each vertex in the GRAPH + + No inputs + + From all Prescribers: + (1) In each iteration, compute a score for each vertex: + score = (1-damping) + damping*sum(received scores FROM its neighbors). + (2) The pageRank algorithm stops when either of the following is true: + a) it reaches max_iter iterations; + b) the max score change for any vertex compared to the last iteration <= max_change. 
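(Worked example of the update rule above, with the default damping of 0.85: if a vertex receives a summed score of 1.2 from its neighbors, its new score is (1 - 0.85) + 0.85 * 1.2 = 1.17.)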
+*/ TYPEDEF TUPLE Vertex_Score; HeapAccum(output_limit, score DESC) @@top_scores; @@ -34,4 +40,4 @@ CREATE QUERY algo_page_rank(FLOAT max_change = 0.001, INT max_iter = 25, PRINT @@top_scores; END; -} \ No newline at end of file +} From 4bca663226541c8dd2b206439418d7f748cc6450 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:50:37 -0400 Subject: [PATCH 77/93] Update conn_comp.gsql --- .../db_scripts/queries/conn_comp.gsql | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql index aeb7a58..8d4f38b 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql @@ -1,6 +1,18 @@ CREATE QUERY conn_comp (STRING vertex_type, STRING edge_type, STRING rev_edge_type) FOR GRAPH MyGraph SYNTAX V2 { -# This query identifies the Connected Components (undirected edges) + /* + Identifies the Connected Components (undirected edges) + + Sample inputs: + vertex_type: + edge_type: + rev_edge_type: + + Start from given vertex_type: + (1) Initialize: Label each vertex with its own internal ID + (2) Propagate smaller internal IDs until no more ID changes can be Done + + */ MinAccum @cc_id = 0; //each vertex's tentative component id SumAccum @old_id = 0; @@ -16,7 +28,7 @@ CREATE QUERY conn_comp (STRING vertex_type, STRING edge_type, STRING rev_edge_ty x.@old_id = getvid(x) ; -# Propagate smaller internal IDs until no more ID changes can be DOne +# Propagate smaller internal IDs until no more ID changes can be Done WHILE (start.size()>0) DO start = SELECT t FROM start:s -((edge_type|rev_edge_type):e)- :t @@ -38,4 +50,4 @@ CREATE QUERY conn_comp (STRING vertex_type, STRING edge_type, STRING rev_edge_ty POST-ACCUM @@comp_sizes += (s.@cc_id -> 1); PRINT @@comp_sizes; PRINT start [start.@cc_id]; -} \ No newline at end of file +} From 489405f183f1211bbfde107a8ee796b72a29f070 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Thu, 4 Aug 2022 23:50:41 -0400 Subject: [PATCH 78/93] Update conn_comp_enhanced.gsql --- .../queries/conn_comp_enhanced.gsql | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql index 4db66fc..d235a00 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql @@ -1,6 +1,19 @@ CREATE QUERY conn_comp_enhanced (SET vertex_types, STRING vt2, STRING edge_type, STRING rev_edge_type, INT output_level) FOR GRAPH MyGraph SYNTAX V2 { -# This query identifies the Connected Components (undirected edges) +/* + Identifies the Connected Components (undirected edges) + + Sample inputs: + vertex_type: + vt2: + edge_type: + rev_edge_type: + output_level: + + Start from given vertex_type: + (1) Initialize: Label each vertex with its own internal ID + (2) Propagate smaller internal IDs until no more ID changes can be Done +*/ MinAccum @cc_id = 0; //each vertex's tentative component id SumAccum @old_id = 0; @@ -16,7 +29,7 @@ CREATE QUERY 
conn_comp_enhanced (SET vertex_types, STRING vt2, x.@old_id = getvid(x) ; -# Propagate smaller internal IDs until no more ID changes can be DOne +# Propagate smaller internal IDs until no more ID changes can be Done WHILE (start.size()>0) DO start = SELECT t FROM start:s -((edge_type|rev_edge_type):e)- :t @@ -41,4 +54,4 @@ CREATE QUERY conn_comp_enhanced (SET vertex_types, STRING vt2, IF output_level > 0 THEN PRINT start [start.@cc_id]; END; -} \ No newline at end of file +} From 91b985cb124cc933edc5b07bd667a1735584e344 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:22:24 -0400 Subject: [PATCH 79/93] Update conn_comp.gsql --- .../db_scripts/queries/conn_comp.gsql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql index 8d4f38b..99bd4fe 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql @@ -4,9 +4,9 @@ CREATE QUERY conn_comp (STRING vertex_type, STRING edge_type, STRING rev_edge_ty Identifies the Connected Components (undirected edges) Sample inputs: - vertex_type: - edge_type: - rev_edge_type: + vertex_type: claim + edge_type: associated + rev_edge_type: reverse_associated Start from given vertex_type: (1) Initialize: Label each vertex with its own internal ID From 6947d518d6fec7b85fdccd32c4065e0d12abfd28 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:23:21 -0400 Subject: [PATCH 80/93] Update conn_comp_enhanced.gsql --- .../db_scripts/queries/conn_comp_enhanced.gsql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql index d235a00..c348d95 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql @@ -4,11 +4,11 @@ CREATE QUERY conn_comp_enhanced (SET vertex_types, STRING vt2, Identifies the Connected Components (undirected edges) Sample inputs: - vertex_type: - vt2: - edge_type: - rev_edge_type: - output_level: + vertex_type: claim + vt2: N/A + edge_type: associated + rev_edge_type: reverse_associated + output_level: 1 Start from given vertex_type: (1) Initialize: Label each vertex with its own internal ID From ed79834ee162227a9fbcad4c55c5777000f68d48 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:27:09 -0400 Subject: [PATCH 81/93] Update get_community.gsql --- .../db_scripts/queries/get_community.gsql | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql index a9147c7..568e5aa 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql @@ -1,10 +1,22 @@ CREATE QUERY get_community(STRING 
prescriber_Id, INT community_Id) FOR GRAPH MyGraph SYNTAX V2 { - /* This query finds the vertices and interconnecting edges associated either with the given - * prescriber_Id, or if the prescriber_Id is not provided (empty string), then - * for the given community_Id. - * NOTE: This algorithm requires that the community_Id attribute has been set, - * by running the alg_louvain query, + /* + + This query finds the vertices and interconnecting edges associated either with the given + prescriber_Id, or if the prescriber_Id is not provided (empty string), then + for the given community_Id. + NOTE: This algorithm requires that the community_Id attribute has been set, + by running the alg_louvain query, + + Sample inputs: + prescriber_Id: pre78 | pre30 + community_Id: 10 + + Start from all Prescribers: + (1) Select prescribers where the id equals the given prescriber_id and set comm_Id + (2) Get all the vertices and intercomnnecting edges with the give comm_Id + */ + SetAccum @@edge_list; INT comm_Id; @@ -20,11 +32,11 @@ CREATE QUERY get_community(STRING prescriber_Id, INT community_Id) FOR GRAPH MyG END; PRINT comm_Id; - // Get all the vertices and intercomnnecting edges with the give comm_Id + // Get all the vertices and intercomnnecting edges with the given comm_Id comm_vertices = SELECT s FROM start:s -(referral>:e)- :t WHERE s.communityId == comm_Id AND t.communityId == comm_Id ACCUM @@edge_list += e; PRINT comm_vertices[comm_vertices.Prescriber_id]; PRINT @@edge_list; -} \ No newline at end of file +} From c2042b9a960f7498c5b4625c6a8688c4423cd1d6 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:29:17 -0400 Subject: [PATCH 82/93] Update get_community.gsql --- .../db_scripts/queries/get_community.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql index 568e5aa..61b1217 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql @@ -1,7 +1,7 @@ CREATE QUERY get_community(STRING prescriber_Id, INT community_Id) FOR GRAPH MyGraph SYNTAX V2 { /* - This query finds the vertices and interconnecting edges associated either with the given + Finds the vertices and interconnecting edges associated either with the given prescriber_Id, or if the prescriber_Id is not provided (empty string), then for the given community_Id. 
NOTE: This algorithm requires that the community_Id attribute has been set, From 53650211ed8eb998e154ad8fe8f8870d10b5966e Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:31:47 -0400 Subject: [PATCH 83/93] Update insert_all_referrals.gsql --- .../db_scripts/queries/insert_all_referrals.gsql | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_all_referrals.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_all_referrals.gsql index 0e637df..546c78e 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_all_referrals.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_all_referrals.gsql @@ -1,4 +1,14 @@ CREATE QUERY insert_all_referrals () FOR GRAPH MyGraph SYNTAX V2 { + +/* + + Inserts and returns the total referrals across prescribers + + No inputs + + From all Prescriber vertices: + (1) Select all prescribers, insert referrals, and count the number of referrals +*/ SumAccum @@num_insertions; @@ -8,4 +18,4 @@ CREATE QUERY insert_all_referrals () FOR GRAPH MyGraph SYNTAX V2 { ACCUM @@num_insertions += insert_referrals(s); PRINT @@num_insertions; -} \ No newline at end of file +} From a7f4e7819643a41b9136f986b9399f1f4ab45173 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:35:00 -0400 Subject: [PATCH 84/93] Update insert_referrals.gsql --- .../db_scripts/queries/insert_referrals.gsql | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql index 4b4b93b..00f5e29 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql @@ -1,6 +1,22 @@ CREATE QUERY insert_referrals(VERTEX input_prescriber) FOR GRAPH MyGraph RETURNS (INT) SYNTAX V2 { + /* + + Inserts and returns number of referral insertions + + Sample input: + input_prescriber: pre38 + + Start from the input_prescriber: + (1) Select claims from input_prescriber and mark as visited + (2) Select patients from the claims and update date lists + (3) Select other claims from patients in (2) that are unvisited + (4) Select claims from (3) and insert into referral + + + */ + OrAccum @visited, @is_referred_claim; ListAccum @date_list; SumAccum @@num_insertions; @@ -27,4 +43,4 @@ CREATE QUERY insert_referrals(VERTEX input_prescriber) @@num_insertions += 1; print start_set; RETURN @@num_insertions; -} \ No newline at end of file +} From fbe587276ae58cade2b87c5886c6c26e2702107c Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:36:44 -0400 Subject: [PATCH 85/93] Update insert_referrals.gsql --- .../db_scripts/queries/insert_referrals.gsql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql index 00f5e29..8b0b269 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql +++ 
b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql @@ -3,7 +3,7 @@ CREATE QUERY insert_referrals(VERTEX input_prescriber) /* - Inserts and returns number of referral insertions + Inserts and returns number of referral insertions from unvisited claims Sample input: input_prescriber: pre38 From cb550294c3a329fd18380b2f536e015f7ae2ac3a Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:39:01 -0400 Subject: [PATCH 86/93] Update kcore_decomp.gsql --- .../db_scripts/queries/kcore_decomp.gsql | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_decomp.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_decomp.gsql index ccd396b..0dbaf15 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_decomp.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_decomp.gsql @@ -1,12 +1,23 @@ CREATE QUERY kcore_decomp(STRING vertex_type, STRING edge_type, INT k_min, INT k_max = -1, BOOL show_membership=false, BOOL show_shells=true) FOR GRAPH MyGraph SYNTAX V2 { -/* Outputs the k-core vertex membership for each value of k from k_min to k_max. - * By definition, for k=0, the vertex set = the entire graph. - * As k increases, V(k) is a subset of V(k-1). - * If k_max < 0, then the query proceeds until it reaches the maximal k-core. - * Calls kcore_sub(). +/* + Outputs the k-core vertex membership for each value of k from k_min to k_max. + By definition, for k=0, the vertex set = the entire graph. + As k increases, V(k) is a subset of V(k-1). + If k_max < 0, then the query proceeds until it reaches the maximal k-core. + Calls kcore_sub(). + + Sample inputs: + vertex_type: claim + edge_type: associated + k_min: 0 + + (1) print just the size of each k-core + (2) print the membership of each k-core + */ + MapAccum> @@core_list_map; // Map SetAccum @@k_core_vertices; // vertex set for k_max ListAccum @@induced_edges; // optional output @@ -47,4 +58,4 @@ CREATE QUERY kcore_decomp(STRING vertex_type, STRING edge_type, INT k_min, END; END; END; -} \ No newline at end of file +} From 2d14fac6ed73e0be2e2d95e76f5f8ae03b0a4bce Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:41:40 -0400 Subject: [PATCH 87/93] Update kcore_max.gsql --- .../db_scripts/queries/kcore_max.gsql | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_max.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_max.gsql index 8a166a0..2a44323 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_max.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_max.gsql @@ -1,8 +1,19 @@ CREATE QUERY kcore_max (STRING vertex_type, STRING edge_type, BOOL induced_edges, INT verbosity) FOR GRAPH MyGraph SYNTAX V2 { -/* An implementation of Algorithm 2 in - * Scalable K-Core Decomposition for Static Graphs Using a Dynamic Graph Data Structure, - * Tripathy et al., IEEE Big Data 2018. +/* + An implementation of Algorithm 2 in Scalable K-Core Decomposition for + Static Graphs Using a Dynamic Graph Data Structure, Tripathy et al., + IEEE Big Data 2018. 
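(For reference, a k-core is a maximal subgraph in which every vertex has at least k neighbors within that subgraph; for example, a triangle of claim vertices forms a 2-core, while a claim attached to it by only a single edge belongs to the 1-core but not the 2-core.)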
+ + Sample inputs: + vertex_type: claim + edge_type: associated + induced_egdes: False + verbosity: 3 + + (1) Get vertex core numbers from kcore_sub query + (2) Print results for k, kcore_vertices, etc + */ MapAccum> @@core_list_map; // Map @@ -25,4 +36,4 @@ CREATE QUERY kcore_max (STRING vertex_type, STRING edge_type, BOOL induced_edges ACCUM @@induced_edges += e; PRINT @@induced_edges; END; -} \ No newline at end of file +} From 2676794578a210ab105a9e13ca6daefc8a83de0b Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:44:46 -0400 Subject: [PATCH 88/93] Update kcore_sub.gsql --- .../db_scripts/queries/kcore_sub.gsql | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_sub.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_sub.gsql index 4388cfb..6979e26 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_sub.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_sub.gsql @@ -1,8 +1,23 @@ CREATE QUERY kcore_sub (STRING vertexType, STRING edgeType, INT verbosity) FOR GRAPH MyGraph RETURNS (MapAccum>) SYNTAX V2 { -/* An implementation of Algorithm 2 in - * Scalable K-Core Decomposition for Static Graphs Using a Dynamic Graph Data Structure, - * Tripathy et al., IEEE Big Data 2018. - * Returns a map > where are those who are in that k-core but not (k+1)-core +/* + An implementation of Algorithm 2 in Scalable K-Core Decomposition for Static Graphs + Using a Dynamic Graph Data Structure, Tripathy et al., IEEE Big Data 2018. + Returns a map > where are those who are in + that k-core but not (k+1)-core + + Sample inputs: + vertexType: claim + edgeType: associated + verbosity: 5 + + Start from vertexType: + (1) Initialize @deg with vertexType's outdegree + (2) Find vertices whose degree < or = k and mark those vertices individually + (3) Set the core level of those vertices and collect those vertices + (4) Reduce degree of vertices + (5) Print @@core_list_map, list of vertices sorted by increasing core level + + */ @@ -59,4 +74,4 @@ CREATE QUERY kcore_sub (STRING vertexType, STRING edgeType, INT verbosity) FOR G @@core_list_map += (k -> @@Q); END; RETURN @@core_list_map; -} \ No newline at end of file +} From 3d3069a81f9f159ac9f2ff71804975c0405b0590 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:50:28 -0400 Subject: [PATCH 89/93] Update scc.gsql --- .../db_scripts/queries/scc.gsql | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql index bad4d8e..9e1c19f 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql @@ -1,14 +1,25 @@ CREATE QUERY scc (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10) FOR GRAPH MyGraph SYNTAX V2 { -/* This query detects strongly connected components based on the following papers: - * https://www.sandia.gov/~apinar/papers/irreg00.pdf - * https://www.sciencedirect.com/science/article/pii/S0743731505000535 - * https://stanford-ppl.github.io/website/papers/sc13-hong.pdf +/* + Detects strongly connected components based on the following 
papers: + https://www.sandia.gov/~apinar/papers/irreg00.pdf + https://www.sciencedirect.com/science/article/pii/S0743731505000535 + https://stanford-ppl.github.io/website/papers/sc13-hong.pdf - * iter: number of iteration of the algorithm - * iter_wcc: find weakly connected components for the active vertices in this iteration, since the largest sccs are already found after several iterations; usually a small number(3 to 10) - * top_k_dist: top k result in scc distribution + Inputs: + iter: number of iteration of the algorithm + iter_wcc: find weakly connected components for the active vertices in this iteration, since the largest sccs are already found after several iterations; usually a small number(3 to 10) + top_k_dist: top k result in scc distribution - * DISTRIBUTED QUERY mode for this query is supported from TG 2.4. + *DISTRIBUTED QUERY mode for this query is supported from TG 2.4. + + Select all Prescribers: + (1) Initialize accumulators + (2) Trim size 1 SCC + (3) Get WCC + (4) Mark forward set + (5) Mark backward set + (5) Return results of SCC detection + */ TYPEDEF TUPLE cluster_num; MapAccum @@cluster_size_map, @@cluster_dist_map; @@ -166,4 +177,4 @@ CREATE QUERY scc (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10) FOR GRA PRINT v_all [v_all.@cid]; -} \ No newline at end of file +} From 682910818c44db5e68c5a904d903557582c2760c Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:51:40 -0400 Subject: [PATCH 90/93] Update scc.gsql --- .../db_scripts/queries/scc.gsql | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql index 9e1c19f..bc115af 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql @@ -7,7 +7,9 @@ CREATE QUERY scc (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10) FOR GRA Inputs: iter: number of iteration of the algorithm - iter_wcc: find weakly connected components for the active vertices in this iteration, since the largest sccs are already found after several iterations; usually a small number(3 to 10) + iter_wcc: find weakly connected components for the active vertices in + this iteration, since the largest sccs are already found after + several iterations; usually a small number(3 to 10) top_k_dist: top k result in scc distribution *DISTRIBUTED QUERY mode for this query is supported from TG 2.4. 
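For reference, a strongly connected component is a maximal set of vertices in which every vertex can reach every other vertex along directed edges; e.g. a directed cycle A -> B -> C -> A forms one SCC, while a vertex D reachable only via A -> D is a singleton SCC. An illustrative invocation mirroring the declared defaults (GSQL shell syntax assumed):

    RUN QUERY scc(500, 5, 10)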
From b5e8bf532b72a48b5f589838b711765dab37f1ba Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:52:04 -0400 Subject: [PATCH 91/93] Update scc_enhanced.gsql --- .../db_scripts/queries/scc_enhanced.gsql | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc_enhanced.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc_enhanced.gsql index b169ba1..c6b2eac 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc_enhanced.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc_enhanced.gsql @@ -1,16 +1,29 @@ CREATE QUERY scc_enhanced (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10) FOR GRAPH MyGraph SYNTAX V2 { -/* This query detects strongly connected components based on the following papers: - * https://www.sandia.gov/~apinar/papers/irreg00.pdf - * https://www.sciencedirect.com/science/article/pii/S0743731505000535 - * https://stanford-ppl.github.io/website/papers/sc13-hong.pdf +/* + Detects strongly connected components based on the following papers: + https://www.sandia.gov/~apinar/papers/irreg00.pdf + https://www.sciencedirect.com/science/article/pii/S0743731505000535 + https://stanford-ppl.github.io/website/papers/sc13-hong.pdf - * iter: number of iteration of the algorithm - * iter_wcc: find weakly connected components for the active vertices in this iteration, since the largest sccs are already found after several iterations; usually a small number(3 to 10) - * top_k_dist: top k result in scc distribution + Inputs: + iter: number of iteration of the algorithm + iter_wcc: find weakly connected components for the active vertices in this + iteration, since the largest sccs are already found after several + iterations; usually a small number(3 to 10) + top_k_dist: top k result in scc distribution - * DISTRIBUTED QUERY mode for this query is supported from TG 2.4. + * DISTRIBUTED QUERY mode for this query is supported from TG 2.4. 
+ + Select all Prescribers: + (1) Initialize accumulators + (2) Trim size 1 SCC + (3) Get WCC + (4) Mark forward set + (5) Mark backward set + (5) Return results of SCC detection */ + TYPEDEF TUPLE Cluster_Num; MapAccum @@cluster_size_map, @@cluster_dist_map; HeapAccum(top_k_dist, csize DESC) @@cluster_dist_heap; @@ -171,4 +184,4 @@ CREATE QUERY scc_enhanced (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10 PRINT v_all [v_all.@cid]; -} \ No newline at end of file +} From 59dc61e9ddfe7787c85386c445549c0059060dfc Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:53:34 -0400 Subject: [PATCH 92/93] Update select_subgraph.gsql --- .../db_scripts/queries/select_subgraph.gsql | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/select_subgraph.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/select_subgraph.gsql index 9c29bba..52146c4 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/select_subgraph.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/select_subgraph.gsql @@ -1,9 +1,20 @@ CREATE QUERY select_subgraph(STRING vertex_type, STRING edge_type) FOR GRAPH MyGraph SYNTAX V2 { - +/* + Returns edges and vertices of subgraph + + Sample inputs: + vertex_type: claim + edge_type: associated + + From given vertex_type: + (1) Select edge list and target set from given edge_type + +*/ + ListAccum @@edge_list; source_set = {vertex_type}; target_set = SELECT t FROM source_set:s -(edge_type:e)- :t ACCUM @@edge_list += e; PRINT source_set, target_set, @@edge_list; -} \ No newline at end of file +} From 7eda1463b5b458652911eaabca82728c82497016 Mon Sep 17 00:00:00 2001 From: kristinezhengx <97307010+kristinezhengx@users.noreply.github.com> Date: Fri, 5 Aug 2022 02:00:55 -0400 Subject: [PATCH 93/93] Update README.gsql --- .../db_scripts/queries/README.gsql | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql index f824b72..8c72195 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql @@ -4,6 +4,43 @@ CREATE QUERY README() FOR GRAPH MyGraph { THE REFERRAL EDGE IS USED IN OTHER QUERIES. */ + + STRING name = "Graph-Analytics-Community-Detection-Algorithms"; + STRING graph_description = "Find communities of a specific type in your network " + + (Louvain Method, Connected Components, K-Core Decomposition, strongly connected components)"; + + STRING query_order = "1. 
insert_all_referrals"; + STRING Print_community = "Returns edges of community given prescriber"; + STRING algo_louvain = "Louvain Method with Parallelism and Refinement https://arxiv.org/pdf/1304.4453 " + + "The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003"; + STRING algo_louvain_enhanced = "Louvain Method with Parallelism and Refinement https://arxiv.org/pdf/1304.4453 " + + "The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003"; + STRING algo_page_rank = "Compute the pageRank score for each vertex in the GRAPH"; + STRING conn_comp = "Identifies the Connected Components (undirected edges)"; + STRING conn_comp_enhanced = "Identifies the Connected Components (undirected edges)"; + STRING get_community = "Finds the vertices and interconnecting edges associated either with the given prescriber_Id, " + + "or if the prescriber_Id is not provided (empty string), then for the given community_Id."; + STRING insert_all_referrals = "Inserts and returns the total referrals across prescribers"; + STRING insert_referrals = "Inserts and returns number of referral insertions from unvisited claims"; + STRING kcore_decomp = "Outputs the k-core vertex membership for each value of k from k_min to k_max. "; + STRING kcore_max = "An implementation of Algorithm 2 in Scalable K-Core Decomposition for Static Graphs " + + "Using a Dynamic Graph Data Structure, Tripathy et al., IEEE Big Data 2018."; + STRING kcore_sub = "An implementation of Algorithm 2 in Scalable K-Core Decomposition for Static Graphs " + + "Using a Dynamic Graph Data Structure, Tripathy et al., IEEE Big Data 2018. "; + STRING scc = "Detects strongly connected components based on the following papers: " + + "https://www.sandia.gov/~apinar/papers/irreg00.pdf, " + + "https://www.sciencedirect.com/science/article/pii/S0743731505000535, " + + "https://stanford-ppl.github.io/website/papers/sc13-hong.pdf"; + STRING scc_enhanced = "Detects strongly connected components based on the following papers: " + + "https://www.sandia.gov/~apinar/papers/irreg00.pdf, " + + "https://www.sciencedirect.com/science/article/pii/S0743731505000535, " + + "https://stanford-ppl.github.io/website/papers/sc13-hong.pdf"; + STRING select_subgraph = "Returns edges and vertices of subgraph"; + + PRINT name, graph_description, query_order, Print_community, algo_louvain, algo_louvain_enhanced, algo_page_rank; + PRINT conn_comp, conn_comp_enhanced, get_community, insert_all_referrals, insert_referrals; + PRINT kcore_decomp, kcore_max, kcore_sub, scc, scc_enhanced, select_subgraph; + print "I read this!"; }