diff --git a/.DS_Store b/.DS_Store index 651b7e0..6924e73 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/Print_community.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/Print_community.gsql deleted file mode 100644 index 94cf234..0000000 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/Print_community.gsql +++ /dev/null @@ -1,20 +0,0 @@ -CREATE OR REPLACE QUERY print_community(vertex inputPrescriber) FOR GRAPH MyGraph { - /* Write query logic here */ - //PRINT "Print_community works!"; - - ListAccum @@edgeList; - - SumAccum @@cid; - - Start={inputPrescriber}; - Start=Select s from Start:s post-accum @@cid += s.communityId; - - Start = {Prescriber.*}; - - Start = select s from Start:s-(referral>:e)-:t - where s.communityId == @@cid and s.communityId == t.communityId - accum @@edgeList += e; - - print Start; - print @@edgeList; -} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/README.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/README.gsql new file mode 100644 index 0000000..e374870 --- /dev/null +++ b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/README.gsql @@ -0,0 +1,7 @@ +CREATE QUERY README() FOR GRAPH MyGraph { + PRINT "Healthcare - Referral networks, Hub (PageRank), & Community Detection"; + PRINT "Investigating healthcare referral networks"; + + PRINT "The queries tg_louvain.gsql and infer_all_referrals.gsql MUST be"; + PRINT "run in that order. 
tg_page_rank.gsql must also be run."; +} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/algo_louvain.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/algo_louvain.gsql deleted file mode 100644 index 346d211..0000000 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/algo_louvain.gsql +++ /dev/null @@ -1,345 +0,0 @@ -CREATE OR REPLACE QUERY algo_louvain(INT iter1 = 10, INT iter2 = 10, - INT iter3 = 10, INT split = 10, INT output_level = 0) FOR GRAPH MyGraph { - -/* - * Louvain Method with Parallelism and Refinement - * https://arxiv.org/pdf/1304.4453 - * The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003 - * iter: There are three phases in the algorithm -- move, merge and refine. - * Their max number of iterations are set by iter1, iter2, iter3 respectively. - * split: To save memory, split number is 10 by default. When the split number - * is larger, the query is closer to sequential Louvain Method, which is slower. - * When the split number is 1, the query is parallel, but requires more memory. 
- * output_level: 0, only list number; 1, also list members - * fComm, fDist: files to store community label and community distribution - */ - - TYPEDEF TUPLE cluster_num; - TYPEDEF TUPLE v_delta_Q; - HeapAccum(1, delta_Q DESC, cid ASC) @largest_delta_Q; // if delta_Q is the same, select the one with mininal vid - MapAccum @@tot_incident_cluster; // sun of weight incident to clusters - MapAccum @@cluster_sizes; // size of a cluster - MapAccum @weight_to_cluster; // weight from one vertex incident to that cluster - - SumAccum @@total_weight; // total weight of all edges - - SumAccum @weight; // total weight incident to this vertex - - SumAccum @cweight; // total weight incident to this aggregate vertex - - SumAccum @uid; // which vertex it belongs to - - SumAccum @cid; // which cluster it belongs to - - SumAccum @vid; // internal id - - SumAccum @delta_Q; // contribution to the modularity - - SumAccum @@modularity; - - SumAccum @@modularity2; - - MapAccum> @@weight_to_cluster_map; // calculate edges between communities - - MapAccum> @@move_comm; // map of communities that changed its community id - - MapAccum> @@represent_map; - - SetAccum @@represent_set; - - MapAccum @@vertex_map; - - MapAccum> @@edge_map; - - HeapAccum(100, csize ASC) @@cluster_dist; - - MapAccum @@cluster_map; - - MapAccum> @@cluster_members; - - FLOAT last_modularity = 0; - - FLOAT last_modularity2 = 0; - - INT iteration; - - INT Iter1; - - FLOAT epsilon = 0.0001; - - INT iteration2; - - INT partitions; - - INT loop; - - INT debug = 0; // debug: 0, no modularity info; 1, show debug log; 2, modularity for each iteration - - - - partitions = split; - - CASE WHEN split < 1 THEN - partitions = 1; - END; - - - -// Initialize: count edges and set a unique cluster ID for each vertex - Start (ANY) = {Prescriber.*}; - - S (ANY) = SELECT s - FROM Start:s -((referral>|| t.@cid - ACCUM s.@largest_delta_Q += v_delta_Q(t, t.@cid, e.num_patient - 2 * s.@weight * s.@weight/ @@total_weight) - // 
weight_to_cluster is just e.num_patient - POST-ACCUM INT best_cluster = s.@largest_delta_Q.top().cid, - IF s.@largest_delta_Q.size() > 0 and s.@largest_delta_Q.top().delta_Q > 0 and s.@cid != best_cluster THEN - s.@cid = best_cluster - END, - s.@largest_delta_Q.clear(); - - S = SELECT s - FROM Start:s-((referral>| 0, "[redrain]//move", iteration, @@modularity); - - - -// Phase 1 -- Move - -// For each vertex, calculate the change in modularity FROM adding it to each of the nearby clusters - -// Add vertex to cluster with highest positive change in modularity - -// Repeat the above until no vertices change cluster anymore - - S = SELECT s - FROM Start:s - ACCUM @@tot_incident_cluster += (s.@cid -> s.@weight); - - iteration = 1; - Iter1 = iter1 - 1; - - WHILE (iteration < 2 OR @@modularity - last_modularity > epsilon) LIMIT Iter1 DO - iteration = iteration + 1; - loop = 0; - WHILE (loop < partitions) DO - S = SELECT s - FROM Start:s -((referral>| epsilon // s is not a singlet - OR abs(t.@weight - @@tot_incident_cluster.get(t.@cid)) > epsilon ) // or t is not a singlet - OR (abs(s.@weight - @@tot_incident_cluster.get(s.@cid)) < epsilon // s is a singlet - AND abs(t.@weight - @@tot_incident_cluster.get(t.@cid)) < epsilon // t is also a singlet - AND s.@cid > t.@cid) ) // consider only when target label is smaller - ACCUM s.@weight_to_cluster += (t.@cid -> e.num_patient) - POST-ACCUM INT best_cluster = s.@cid, - FLOAT max_delta_Q = 0.0, - FLOAT delta_Q_new = 0.0, - FOREACH (cluster, weightToC) IN s.@weight_to_cluster DO //would be better if this can be distributed - FLOAT incident = @@tot_incident_cluster.get(cluster), - delta_Q_new = weightToC - 2 * incident * s.@weight/ @@total_weight, - IF delta_Q_new > max_delta_Q OR (abs(delta_Q_new - max_delta_Q) < epsilon AND cluster < best_cluster) THEN // when delta_Q_new is equal to max_delta_Q, and the cluster label is smaller, also change - max_delta_Q = delta_Q_new, - best_cluster = cluster - END - END, - IF s.@cid != 
best_cluster THEN - @@tot_incident_cluster += (s.@cid -> (-1 * s.@weight)), - @@tot_incident_cluster += (best_cluster -> s.@weight), - s.@cid = best_cluster - END, - s.@weight_to_cluster.clear(); - loop = loop + 1; - END; - last_modularity = @@modularity; - @@modularity = 0; - T1 = SELECT s - FROM Prescriber:s-((referral>| 0, "[redrain]//move", iteration, @@modularity); - END; - -// Phase 2 -- Merge - iteration2 = 0; - - WHILE (iteration2 < 2 OR @@modularity2 - last_modularity2 > epsilon) LIMIT iter2 DO - iteration2 = iteration2 + 1; - Start = SELECT s - FROM Start:s - ACCUM s.@uid = s.@cid; - // Select the vertices with minimal internal id to represent the coarsened graph - Start = SELECT s - FROM Start:s - ACCUM @@represent_map += (s.@cid -> s); - - FOREACH (key, value) IN @@represent_map DO - @@represent_set += value; - END; - represent = {@@represent_set}; - @@represent_map.clear(); - @@represent_set.clear(); - log(debug > 0, "[redrain]//2_merge", represent.size()); //@@cluster_sizes.size()); - - // Get @cweight from totalIncident - represent = SELECT s - FROM represent:s - ACCUM s.@cweight = @@tot_incident_cluster.get(s.@uid), - @@cluster_sizes += (s.@cid -> 1); - - log(debug > 1, "[redrain]//2_merge", @@weight_to_cluster_map.size()); - iteration = 0; - last_modularity = 0; - @@modularity = 0; - - WHILE (iteration < 2 OR @@modularity - last_modularity > epsilon) limit iter1 DO - iteration = iteration + 1; - - // Calculate.num_patient incident from vertex to cluster in coarsened graph; change every interation - S = SELECT s - FROM Start:s -((referral>| 0 AND @@tot_incident_cluster.get(t.@cid) > 0 //@@tot_incident_cluster keeps changing, can be 0 - ACCUM @@weight_to_cluster_map += (s.@uid -> (t.@cid -> e.num_patient)); // from s, incident to some clusters. 
Not consider the same cluster - represent = SELECT s - FROM represent:s - POST-ACCUM INT best_cluster = s.@cid, - FLOAT max_delta_Q = 0.0, - FLOAT delta_Q_new = 0.0, - FOREACH (cluster, weightToC) IN @@weight_to_cluster_map.get(s.@uid) DO - FLOAT incident = @@tot_incident_cluster.get(cluster), - IF @@cluster_sizes.get(s.@cid) == 1 AND @@cluster_sizes.get(cluster) == 1 AND s.@cid < cluster THEN - CONTINUE - END, - delta_Q_new = weightToC - 2 * incident * s.@cweight/ @@total_weight, //total weight should be the same - IF delta_Q_new > max_delta_Q OR abs(delta_Q_new - max_delta_Q) < epsilon AND cluster < best_cluster THEN // new cluster is smaller then the current best cluster - max_delta_Q = delta_Q_new, - best_cluster = cluster - END - END, - IF s.@cid != best_cluster THEN - @@tot_incident_cluster += (s.@cid -> (-1 * s.@cweight)), - @@tot_incident_cluster += (best_cluster -> s.@cweight), - @@move_comm += (s.@uid -> best_cluster), - @@cluster_sizes += (s.@cid -> -1), - @@cluster_sizes += (best_cluster -> 1), - s.@cid = best_cluster - END; - log(debug > 1, "[redrain]//2_merge", @@weight_to_cluster_map.size()); - @@weight_to_cluster_map.clear(); - - log(debug > 1, "[redrain]//2_move:", @@move_comm.size()); - // move nodes - S = SELECT s - FROM Start:s - WHERE @@move_comm.containsKey(s.@uid) - POST-ACCUM FOREACH v IN @@move_comm.get(s.@uid) DO - s.@cid = v - END; - @@move_comm.clear(); - - last_modularity = @@modularity; - @@modularity = 0; - - S = SELECT s - FROM Start:s-((referral>| 0, "[redrain]//2_move", iteration, @@modularity); - END; - - S = SELECT s - FROM represent:s - ACCUM s.@cweight = 0; - @@cluster_sizes.clear(); - - last_modularity2 = @@modularity2; - @@modularity2 = @@modularity; - PRINT iteration2 AS Phase2Iter, @@modularity2; - log(debug > 0, "[redrain]//2_merge", iteration2, @@modularity2); - - END; - - -// Phase 3 -- Refinement - iteration = 0; - @@modularity = 0; - WHILE (iteration < 2 OR @@modularity - last_modularity > epsilon) LIMIT iter3 DO - 
iteration = iteration + 1; - S = SELECT s - FROM Start:s -((referral>| epsilon OR abs(t.@weight - @@tot_incident_cluster.get(t.@cid)) > epsilon OR (abs(s.@weight - @@tot_incident_cluster.get(s.@cid)) < epsilon AND abs(t.@weight - @@tot_incident_cluster.get(t.@cid)) < epsilon AND s.@cid > t.@cid) // at least one cluster not only itself, or use smaller label - ACCUM s.@weight_to_cluster += (t.@cid -> e.num_patient) - POST-ACCUM - INT best_cluster = s.@cid, - FLOAT max_delta_Q = 0.0, - FLOAT delta_Q_new = 0.0, - FOREACH (cluster, weightToC) IN s.@weight_to_cluster DO //would be better if this can be distributed - FLOAT incident = @@tot_incident_cluster.get(cluster), - delta_Q_new = weightToC - 2 * incident * s.@weight/ @@total_weight, - IF delta_Q_new > max_delta_Q OR (abs(delta_Q_new - max_delta_Q) < epsilon AND cluster < best_cluster) THEN // when delta_Q_new is equal to max_delta_Q, and the cluster label is smaller, also change - max_delta_Q = delta_Q_new, - best_cluster = cluster - END - END, - IF s.@cid != best_cluster THEN - @@tot_incident_cluster += (s.@cid -> (-1 * s.@weight)), - @@tot_incident_cluster += (best_cluster -> s.@weight), - s.@cid = best_cluster - END, - s.@weight_to_cluster.clear(); - last_modularity = @@modularity; - @@modularity = 0; - T1 = SELECT s - FROM Start:s-((referral>| 0, "[redrain]//refine", iteration, @@modularity); - END; - - - Print Start [Start.@cid]; - Start = {ANY}; - Start = SELECT s FROM Start:s - POST-ACCUM @@cluster_sizes += (s.@cid -> 1),s.communityId=s.@cid - ; - log(TRUE, @@cluster_sizes.size()); - IF output_level ==0 THEN - FOREACH (cluster, csize) IN @@cluster_sizes DO - @@cluster_map += (csize -> 1); - END; - FOREACH (csize, number) IN @@cluster_map DO - @@cluster_dist += cluster_num(csize, number); - END; - PRINT @@cluster_dist; - ELSE - FOREACH (cluster, csize) IN @@cluster_sizes DO - @@cluster_members += (csize -> cluster); - END; - PRINT @@cluster_members; - PRINT "Community Detection Done"; - END; -} \ No newline at 
end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/algo_page_rank.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/algo_page_rank.gsql deleted file mode 100644 index a711d0f..0000000 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/algo_page_rank.gsql +++ /dev/null @@ -1,37 +0,0 @@ -CREATE OR REPLACE QUERY algo_page_rank(FLOAT maxChange, INT maxIter, FLOAT damping, INT outputLimit) FOR GRAPH MyGraph { - -/* Compute the pageRank score for each vertex in the GRAPH - In each iteration, compute a score for each vertex: - score = (1-damping) + damping*sum(received scores FROM its neighbors). - The pageRank algorithm stops when either of the following is true: - a) it reaches maxIter iterations; - b) the max score change for any vertex compared to the last iteration <= maxChange. -*/ - - TYPEDEF TUPLE vertexScore; - HeapAccum(outputLimit, score DESC) @@top_Scores; - MaxAccum @@max_Diff = 999999; // max score change in an iteration - SumAccum @received_score = 0; // sum of scores each vertex receives FROM neighbors - SumAccum @score = 1; // Initial score for every vertex is 1. 
- SetAccum @@edge_Set; // list of all edges, if display is needed - - Start = {Prescriber.*}; // Start with all vertices of specified type(s) - V (ANY) = {}; - WHILE @@max_Diff > maxChange LIMIT maxIter DO - @@max_Diff = 0; - V = SELECT s - FROM Start:s -(referral>:e)- :t - ACCUM t.@received_score += s.@score/(s.outdegree("referral")) - POST-ACCUM s.@score = (1.0-damping) + damping * s.@received_score, - s.@received_score = 0, - @@max_Diff += abs(s.@score - s.@score'); - END; // END WHILE loop - - - IF outputLimit > 0 THEN - V = SELECT s FROM Start:s - POST-ACCUM @@top_Scores += vertexScore(s, s.@score),s.pageRank=s.@score; - PRINT @@top_Scores; - END; - -} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex1_common_patients.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex1_common_patients.gsql deleted file mode 100644 index c519255..0000000 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex1_common_patients.gsql +++ /dev/null @@ -1,39 +0,0 @@ -CREATE OR REPLACE QUERY ex1_common_patients(vertex Prescriber1, vertex Prescriber2) FOR GRAPH MyGraph { - - OrAccum @visited; - SetAccum @@edge_Set; - Pre1 = {Prescriber1}; - Pre2 = {Prescriber2}; - // Step 1 – Start graph Traversal from first prescriber to find all associated claims. Use visited flag to remember claims visited. - claims1 = SELECT t - FROM Pre1:s -(:e)- Patient:t - ACCUM t.@visited += TRUE; - // Step 3 Start graph traversal from second prescriber to find all claims - claims2 = SELECT t - FROM Pre2:s -(:e)- Patient:t - WHERE t.@visited == TRUE; - PRINT common_patients; - - // Step 5 – From common patients find all claims that have been visited in earlier steps. Collect the edges so they can be printed. 
- claims = SELECT t - FROM common_patients:s -(:e)- Prescriber:t - ACCUM @@edge_Set += e; - PRINT pres; - PRINT @@edge_Set; -} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex2_create_referral_edge.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex2_create_referral_edge.gsql deleted file mode 100644 index d85c83e..0000000 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex2_create_referral_edge.gsql +++ /dev/null @@ -1,27 +0,0 @@ -CREATE OR REPLACE QUERY ex2_create_referral_edge(VERTEX inputPrescriber) FOR GRAPH MyGraph { - OrAccum @visited, @is_Referred_Claim; - - ListAccum @date_List; - - start_set = {inputPrescriber}; - - claims = SELECT t FROM start_set:s-(:e)-:t - ACCUM t.@date_List += s.rx_fill_date; - - claims = SELECT t FROM patients:s-(:e)-:t - POST-ACCUM INSERT INTO referral VALUES(inputPrescriber, t, 1); - print start_set; - -} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex2_main_query.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex2_main_query.gsql deleted file mode 100644 index 2e54eab..0000000 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/ex2_main_query.gsql +++ /dev/null @@ -1,6 +0,0 @@ -CREATE OR REPLACE QUERY ex2_main_query(/* Parameters here */) FOR GRAPH MyGraph { - - all_prescribers = select s from Prescriber:s accum ex2_create_referral_edge(s); - - print all_prescribers; -} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_claims.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_claims.gsql deleted file mode 100644 index 643166d..0000000 --- 
a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_claims.gsql +++ /dev/null @@ -1,11 +0,0 @@ -CREATE OR REPLACE QUERY get_claims(vertex inputPrescriber) FOR GRAPH MyGraph { - - ListAccum @@list; - - start_set = {inputPrescriber}; - - claims = SELECT t FROM start_set:s-( input_prescriber) FOR GRAPH MyGraph { +/* + Get the Claims of a given Prescriber + + Sample inputs: + input_prescriber: pre6 | pre30 | pre13 + + Starting from an "input_prescriber", + (1) Grab all the prescriber's claims + Prescriber -( @@submitted_by_list; + + start = {input_prescriber}; + + claims = SELECT t // select claims connected to the input prescriber + FROM start:s-( prescriber1, + VERTEX prescriber2) FOR GRAPH MyGraph { +/* + Get the Patients that two Prescribers have in common + + Sample inputs: + prescriber1: pre6 | pre30 | pre13 + prescriber2: pre6 | pre30 | pre13 + + Starting from "prescriber1", + (1) Mark the connected Claim vertices as visited. + (2) Mark Patient vertices connected to the Claim vertices as visited. + Prescriber -()- Patient + Starting from "prescriber2", + (1) Mark the connected Claim vertices as visited. + (2) Find all Patients connected to Claims that has been visited. + Prescriber -()- Patient +*/ + + OrAccum @visited; + SetAccum @@edge_set; + + pre1 = {prescriber1}; + pre2 = {prescriber2}; + + /* Step 1 – Start graph Traversal from first prescriber to find all + associated claims. Use visited flag to remember claims visited. */ + claims1 = SELECT t + FROM pre1:s -(:e)- Patient:t + ACCUM t.@visited += TRUE; + + // Step 3 - Start graph traversal from second prescriber to find all claims + claims2 = SELECT t + FROM pre2:s -(:e)- Patient:t + WHERE t.@visited == TRUE; + PRINT common_patients; + + /* Step 5 – From common patients find all claims that have been visited + in earlier steps. Collect the edges so they can be printed. 
*/ + claims = SELECT t + FROM common_patients:s -(:e)- Prescriber:t + ACCUM @@edge_set += e; + + PRINT pres; + PRINT @@edge_set; +} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_joint_prescribers.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_joint_prescribers.gsql new file mode 100644 index 0000000..e38b9b3 --- /dev/null +++ b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_joint_prescribers.gsql @@ -0,0 +1,43 @@ +CREATE QUERY get_joint_prescribers(VERTEX input_prescriber) + FOR GRAPH MyGraph { +/* + Get Prescribers who have treated the same patients of a given Prescriber + + Sample inputs: + input_prescriber: pre6 | pre30 | pre13 + + Starting with an "input_prescriber", + (1) Get the connected claims, accumulate the edges, and mark the claims as visited + (2) Get the patients connected to claims and accumulate the edges + (3) Get the claims connected to the patients that have not been visited and accumulate the edges + (4) Get the prescribers connected to the claims and accumulate the edges + (5) Display the prescribers and edges + Prescriber -()- Patient -()- Prescriber +*/ + + ListAccum @@edge_list; // list will have all edges traversed + OrAccum @visited; + + start = {input_prescriber}; + + claims = SELECT t // select connected claims + FROM start:s-(:e)-:t + ACCUM @@edge_list +=e; + + claims = SELECT t + FROM patients:s-(:e)-:t + ACCUM @@edge_list +=e; + + PRINT prescribers, @@edge_list; +} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_k_hop_neighbors.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_k_hop_neighbors.gsql index 175e2c9..6340390 100644 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_k_hop_neighbors.gsql +++ 
b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_k_hop_neighbors.gsql @@ -1,16 +1,32 @@ -CREATE OR REPLACE QUERY get_k_hop_neighbor(int k, vertex input) FOR GRAPH MyGraph { - - OrAccum @visited; - ListAccum @@edgeList; - - start = {input}; - - WHILE start.size() > 0 limit k DO - start = SELECT t from start-(:e)-:t - WHERE t.@visited == false - ACCUM @@edgeList += e - POST-ACCUM t.@visited = true; - END; - - print @@edgeList; +CREATE QUERY get_k_hop_neighbors(INT k, VERTEX input) FOR GRAPH MyGraph { +/* + Get all the vertices within k hops of a source vertex + + Sample inputs: + k: any number > 0 + input: (Claim, 9921) | (SubSpecialty, Cardiology) | (Prescriber, pre78) + + Starting with the "input", + (1) Traverse to all the vertices connected which was not visited + (2) Accumulate the vertices and edges + (3) Mark the vertices as visited + (4) Repeat the traversal k times +*/ + + OrAccum @visited; + ListAccum @@vertex_list; + ListAccum @@edge_list; + + start = {input}; + + WHILE start.size() > 0 LIMIT k DO /* stops either when there are no + vertices or when reaching k */ + start = SELECT t + FROM start-(:e)-:t // visit all connected vertices + WHERE t.@visited == FALSE // vertex must be new + ACCUM @@vertex_list += t, @@edge_list += e // add to global lists + POST-ACCUM t.@visited = TRUE; // mark vertices visited + END; + + PRINT @@vertex_list, @@edge_list; } \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_patients.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_patients.gsql deleted file mode 100644 index a048c95..0000000 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_patients.gsql +++ /dev/null @@ -1,14 +0,0 @@ -CREATE OR REPLACE QUERY get_patients(vertex inputPrescriber) FOR GRAPH MyGraph { - - ListAccum @@list; - - start_set = {inputPrescriber}; - - claims = 
SELECT t FROM start_set:s-(:e)-:t - ACCUM @@list +=e; - - print claims, @@list; -} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_patients_of_prescriber.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_patients_of_prescriber.gsql new file mode 100644 index 0000000..a6a7477 --- /dev/null +++ b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_patients_of_prescriber.gsql @@ -0,0 +1,29 @@ +CREATE QUERY get_patients_of_prescriber(VERTEX input_prescriber) + FOR GRAPH MyGraph { +/* + Get the Patients of a given Prescriber + + Sample inputs: + input_prescriber: pre6 | pre30 | pre13 + + Starting with the "input_prescriber", + (1) Find all the connected Claim vertices + (2) Find all the connected Patient vertices + (3) Print the Claim vertices and all the edges traversed + Prescriber -()- Patient +*/ + + ListAccum @@edge_list; + + start = {input_prescriber}; + + claims = SELECT t // select connected claims + FROM start:s-(:e)-:t + ACCUM @@edge_list += e; + + PRINT patients, @@edge_list; +} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_prescribers.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_prescribers.gsql deleted file mode 100644 index 47a8847..0000000 --- a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_prescribers.gsql +++ /dev/null @@ -1,22 +0,0 @@ -CREATE OR REPLACE QUERY get_prescribers(vertex inputPrescriber) FOR GRAPH MyGraph { - ListAccum @@list; - OrAccum @visited; - - start_set = {inputPrescriber}; - - claims = SELECT t FROM start_set:s-(:e)-:t - ACCUM @@list +=e; - - claims = SELECT t FROM patients:s-(:e)-:t - ACCUM @@list +=e; - - print prescribers, @@list; -} \ No newline at end of file diff --git 
a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_referral_community.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_referral_community.gsql new file mode 100644 index 0000000..4abf375 --- /dev/null +++ b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/get_referral_community.gsql @@ -0,0 +1,31 @@ +CREATE QUERY get_referral_community(VERTEX input_prescriber) + FOR GRAPH MyGraph { +/* + Get the Prescribers in the same referral community as that of a + given Prescriber + + Sample inputs: + input_prescriber: pre6 | pre30 | pre13 + + Starting with the Prescriber vertices, + (1) Find all vertices with the same community ID as the input_prescriber + (2) Print the Prescriber vertices in the community and edges +*/ + + ListAccum @@edge_list; + SumAccum @@cid; + + start = {input_prescriber}; + start = SELECT s + FROM start:s + POST-ACCUM @@cid += s.communityId; + + start = {Prescriber.*}; + + start = SELECT s + FROM start:s-(referral>:e)-:t + WHERE s.communityId == @@cid AND s.communityId == t.communityId + ACCUM @@edge_list += e; + + PRINT start, @@edge_list; +} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/infer_all_referrals.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/infer_all_referrals.gsql new file mode 100644 index 0000000..26c9370 --- /dev/null +++ b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/infer_all_referrals.gsql @@ -0,0 +1,16 @@ +CREATE QUERY infer_all_referrals() FOR GRAPH MyGraph { +/* + Create referral edges from any Prescriber to other Providers when they + treat the same Patient in sequence + + Starting with the Prescriber vertices, + (1) Run infer_referrals + (2) Print the Prescriber vertices +*/ + + all_prescribers = SELECT s + FROM Prescriber:s + ACCUM infer_referrals(s); // run 
infer_referrals query + + PRINT all_prescribers; +} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/infer_referrals.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/infer_referrals.gsql new file mode 100644 index 0000000..cdc205e --- /dev/null +++ b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/infer_referrals.gsql @@ -0,0 +1,50 @@ +CREATE QUERY infer_referrals(VERTEX input_prescriber) + FOR GRAPH MyGraph { +/* + Create referral edges from a given Prescriber to other Providers when + they treat the same Patient in sequence + + Sample inputs: + input_prescriber: pre6 | pre30 | pre13 + + Starting with an "input_prescriber", + (1) Find all the connected Claim vertices and mark them as visited + (2) Find all the connected Patient vertices to the Claim vertices + (3) Find all the connected Claim vertices that are have not been visited + and the claims has been filled at most 30 days before the input + prescriber's claim + Prescriber -()- Patient + -( @visited, @is_referred_claim; + ListAccum @date_list; + + start = {input_prescriber}; + + claims = SELECT t // get connected claims + FROM start:s-(:e)-:t + ACCUM t.@date_list += s.rx_fill_date; // flow dates to patients + + referred_claims = SELECT t + FROM patients:s-(:e)-:t + POST-ACCUM INSERT INTO referral VALUES(input_prescriber, t, 1); /* add + a referral edge between the prescribers and the input prescriber */ + PRINT start; +} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/tg_louvain.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/tg_louvain.gsql new file mode 100644 index 0000000..386b655 --- /dev/null +++ b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/tg_louvain.gsql @@ -0,0 +1,228 @@ +CREATE QUERY tg_louvain(SET v_type, 
SET e_type, STRING wt_attr = "weight", INT max_iter = 10, + STRING result_attr = "cid", STRING file_path = "", BOOL print_info = FALSE) SYNTAX V1 { + + /* + louvain community detection algorithm + add keyword DISTRIBUTED for cluster environment + + Sample inputs: + v_type: Prescriber + e_type: referral, reverse_referral + wt_attr: num_patients + result_attr: communityId + + Parameters: + v_type: vertex types to traverse + e_type: edge types to traverse + wt_attr: attribute name for edge weights use empty string is graph is unweighted + wt_attr type is hardcoded to FLOAT INT or DOUBLE can be supported by changing all `e.getAttr(wt_attr, "FLOAT")` + to `e.getAttr(wt_attr, "INT")` or `e.getAttr(wt_attr, "DOUBLE")` + * note: when there is a weight attribute missmatch, there may not be an explicit error message + all print results showing 0 data are present is an indication that there might be a weight attribute missmatch + + max_iter: maximum iteration of louvain optimization + result_attr: attribute name to assign community id results to; use empty string to skip + file_path: file path to write CSV output to; use empty string to skip + print_info: print louvain execution info + */ + + TYPEDEF TUPLE move; + SumAccum @sum_ac; #sum of the degrees of all the vertices in community C of the vertex + ListAccum @cc_list; #the community center + SumAccum @sum_weight; # total weight incident to this vertex + SumAccum @sum_cc_weight; # total weight incident to the cc vertex + MapAccum> @A_map; #A[c]: sum of the edge weights for the edges in community c + MaxAccum @max_best_move; # highest dQ, highest -Outdegree, highest cc + ListAccum @cm_list; #community member list + SumAccum @@sum_m; # total edge weight + SumAccum @sum_outdegree; # helper variable for outdegree calculation + SumAccum @@sum_cc_change; + MapAccum> @@community_map; + MapAccum> @@community_size_count; + FILE f(file_path); + + // initialize + Start = {v_type}; + Start = SELECT s + FROM Start:s -(e_type:e)- :t + 
ACCUM + @@sum_m += e.getAttr(wt_attr, "INT")*0.5, + s.@sum_weight += e.getAttr(wt_attr, "INT")*1.0, + s.@sum_cc_weight += e.getAttr(wt_attr, "INT")*1.0, + s.@sum_outdegree += 1 + // mark @cc only for vertices with more than 1 neighbors + // and only the marked vertices will participate in the actual louvain algorithm + // the unmorked vertices will be resolved by the vertex following heuristic + POST-ACCUM + IF s.@sum_outdegree > 1 THEN + s.@cc_list += s + END; + IF print_info THEN + PRINT Start.size() AS AllVertexCount; + END; + + // special @cc update in the first iteration + Start = SELECT t + FROM Start:s -(e_type:e)- :t + WHERE s.@sum_outdegree > 1 AND t.@sum_outdegree > 1 + ACCUM + t.@max_best_move += move(e.getAttr(wt_attr, "INT")*1.0 + @@sum_m*t.@sum_weight * + (t.@sum_weight - s.@sum_weight), -s.@sum_cc_weight, s.@cc_list.get(0)) + POST-ACCUM + IF t.@max_best_move.deltaQ > 0 THEN + IF -t.@max_best_move.weight < t.@sum_cc_weight THEN + t.@cc_list.clear(), + t.@cc_list += t.@max_best_move.cc, + t.@sum_cc_weight = -t.@max_best_move.weight, + @@sum_cc_change += 1 + ELSE + IF -t.@max_best_move.weight == t.@sum_cc_weight AND getvid(t) < getvid(t.@max_best_move.cc) THEN + t.@cc_list.clear(), + t.@cc_list += t.@max_best_move.cc, + t.@sum_cc_weight = -t.@max_best_move.weight, + @@sum_cc_change += 1 + END + END + END; + IF print_info THEN + PRINT @@sum_cc_change AS InitChangeCount; + END; + + // main loop + WHILE @@sum_cc_change > 0 LIMIT max_iter DO + // initialize for iteration + @@sum_cc_change = 0; + Start = SELECT s + FROM Start:s + WHERE s.@sum_outdegree > 1 + POST-ACCUM + s.@sum_ac = 0, + s.@cm_list.clear(), + s.@A_map.clear(); + + Start = SELECT s + FROM Start:s + ACCUM + FOREACH v IN s.@cc_list DO + CASE WHEN getvid(v) != -1 THEN + v.@cm_list += s + END + END; + + Start = SELECT s + FROM Start:s -(e_type:e)- :t + WHERE t.@sum_outdegree > 1 + ACCUM + s.@A_map += (t.@cc_list.get(0) -> e.getAttr(wt_attr, "INT")*1.0); + + Start = SELECT s + FROM Start:s + ACCUM 
+ FOREACH v IN s.@cc_list DO + CASE WHEN getvid(v) != -1 THEN + v.@sum_ac += s.@sum_weight + END + END; + + Start = SELECT s + FROM Start:s + ACCUM + FOREACH v IN s.@cm_list DO + CASE WHEN getvid(v) != -1 THEN + v.@sum_ac = s.@sum_ac + END + END; + + // compute @max_dQ + Start = SELECT s + FROM Start:s -(e_type:e)- :t + WHERE t.@sum_outdegree > 1 + ACCUM + INT A_s = 0, + IF s.@A_map.containsKey(s) THEN + A_s = s.@A_map.get(s) + END, + s.@max_best_move += move(s.@A_map.get(t.@cc_list.get(0)) - A_s + + 1/@@sum_m*s.@sum_weight*(s.@sum_ac-t.@sum_ac), -t.@sum_cc_weight, t.@cc_list.get(0)) + POST-ACCUM + IF s.@max_best_move.deltaQ > 0 THEN + IF -s.@max_best_move.weight < s.@sum_cc_weight THEN // smallest best_move weight < current weight + s.@cc_list.clear(), + s.@cc_list += s.@max_best_move.cc, + s.@sum_cc_weight = -s.@max_best_move.weight, + @@sum_cc_change += 1 + ELSE + IF -s.@max_best_move.weight == s.@sum_cc_weight AND getvid(s.@cc_list.get(0)) < getvid(s.@max_best_move.cc) THEN + s.@cc_list.clear(), + s.@cc_list += s.@max_best_move.cc, + s.@sum_cc_weight = -s.@max_best_move.weight, + @@sum_cc_change += 1 + END + END + END; + IF print_info THEN + PRINT @@sum_cc_change AS IterChangeCount; + END; + END; + + // process node with outdegree=1 + // follow the vertex to its neighbor's community + // if the neighbor also have outdegree=1, mark the two vertices as one community + Start = {v_type}; + Start = SELECT s + FROM Start:s -(e_type:e)- :t + WHERE s.@sum_outdegree == 1 AND t.@sum_outdegree != 1 + ACCUM + s.@cc_list += t.@cc_list.get(0); + IF print_info THEN + PRINT Start.size() AS VertexFollowedToCommunity; + END; + + Start = {v_type}; + Start = SELECT s + FROM Start:s -(e_type:e)- :t + WHERE s.@sum_outdegree == 1 AND t.@sum_outdegree == 1 + ACCUM + IF getvid(s) <= getvid(t) THEN + s.@cc_list += s + ELSE + s.@cc_list += t + END; + IF print_info THEN + PRINT Start.size() AS VertexFollowedToVertex; + END; + + // process node with outdegree=0 + // assign them to 
communities containing only itself + Start = {v_type}; + Start = SELECT s + FROM Start:s + WHERE s.@sum_outdegree == 0 + ACCUM + s.@cc_list += s; + IF print_info THEN + PRINT Start.size() AS VertexAssignedToItself; + END; + + // save result + Start = {v_type}; + Start = SELECT s + FROM Start:s + POST-ACCUM + IF result_attr != "" THEN + s.setAttr(result_attr, getvid(s.@cc_list.get(0))) + END, + IF file_path != "" THEN + f.println(s, getvid(s.@cc_list.get(0))) + END; + + // print result satistic + IF print_info THEN + Start = SELECT s + FROM Start:s + WHERE s.@cc_list.size() > 0 + POST-ACCUM + @@community_map += (getvid(s.@cc_list.get(0)) -> 1); + PRINT @@community_map.size() AS FinalCommunityCount; + END; +} \ No newline at end of file diff --git a/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/tg_page_rank.gsql b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/tg_page_rank.gsql new file mode 100644 index 0000000..837779b --- /dev/null +++ b/Healthcare-Referral-networks-Hub-PageRank-Community-Detection/db_scripts/queries/tg_page_rank.gsql @@ -0,0 +1,80 @@
CREATE QUERY tg_pagerank (STRING v_type, STRING e_type,
  FLOAT max_change=0.001, INT max_iter=25, FLOAT damping=0.85, INT top_k = 100,
  BOOL print_accum = TRUE, STRING result_attr = "", STRING file_path = "",
  BOOL display_edges = FALSE) SYNTAX V1 {

/*
  Compute the PageRank score for each vertex in the graph.

  Sample inputs:
    v_type:      Prescriber
    e_type:      referral
    result_attr: pageRank

  In each iteration, compute a score for each vertex:
      score = (1-damping) + damping*sum(received scores FROM its neighbors).
  The PageRank algorithm stops when either of the following is true:
    a) it reaches max_iter iterations;
    b) the max score change for any vertex compared to the last iteration <= max_change.

  Parameters:
    v_type:        vertex type to traverse
    e_type:        edge type to traverse
    max_change:    max allowed change between iterations to achieve convergence
    max_iter:      max #iterations
    damping:       importance of traversal vs. random teleport
    top_k:         #top scores to output
    print_accum:   print JSON output
    result_attr:   attribute to store each vertex's score in; skipped when "".
                   Scores are fractional, so presumably a FLOAT/DOUBLE attribute
                   (confirm against the schema).
    file_path:     file to write CSV output to; skipped when ""
    display_edges: output edges for visualization (only honored when
                   print_accum is TRUE)

  This query supports only taking in a single edge type for the time being (8/13/2020).
*/

# (vertex, score) pair; the heap below orders on the "score" field.
TYPEDEF TUPLE<VERTEX Vertex_ID, FLOAT score> Vertex_Score;
HeapAccum<Vertex_Score>(top_k, score DESC) @@top_scores_heap;  # top_k highest-scoring vertices
MaxAccum<FLOAT> @@max_diff = 9999;      # max score change in an iteration
SumAccum<FLOAT> @sum_recvd_score = 0;   # sum of scores each vertex receives FROM neighbors
SumAccum<FLOAT> @sum_score = 1;         # initial score for every vertex is 1.
SetAccum<EDGE> @@edge_set;              # list of all edges, if display is needed
FILE f (file_path);

# PageRank iterations
Start = {v_type};                       # Start with all vertices of specified type(s)
WHILE @@max_diff > max_change
    LIMIT max_iter DO
  @@max_diff = 0;                       # reset so the loop condition reflects this iteration only
  V = SELECT s
      FROM Start:s -(e_type:e)- v_type:t
      ACCUM
        # s splits its current score evenly across its e_type edges
        t.@sum_recvd_score += s.@sum_score/(s.outdegree(e_type))
      POST-ACCUM
        s.@sum_score = (1.0-damping) + damping * s.@sum_recvd_score,
        s.@sum_recvd_score = 0,
        # s.@sum_score' is the value before this iteration's update
        @@max_diff += abs(s.@sum_score - s.@sum_score');
END; # END WHILE loop

# Output
IF file_path != "" THEN
  f.println("Vertex_ID", "PageRank");   # CSV header row
END;
V = SELECT s
    FROM Start:s
    POST-ACCUM
      IF result_attr != "" THEN
        s.setAttr(result_attr, s.@sum_score)
      END,

      IF file_path != "" THEN
        f.println(s, s.@sum_score)
      END,

      IF print_accum THEN
        @@top_scores_heap += Vertex_Score(s, s.@sum_score)
      END;

IF print_accum THEN
  PRINT @@top_scores_heap;
  IF display_edges THEN
    PRINT Start[Start.@sum_score];
    Start = SELECT s
            FROM Start:s -(e_type:e)- v_type:t
            ACCUM @@edge_set += e;
    PRINT @@edge_set;
  END;
END;
}