diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/A_README.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/A_README.gsql
deleted file mode 100644
index 00f65e1..0000000
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/A_README.gsql
+++ /dev/null
@@ -1,9 +0,0 @@
-CREATE QUERY A_README() FOR GRAPH MyGraph {
-/*
-  IMPORANT : PLEASE INSTALL AND RUN the insert_all_referrals QUERY FIRST.
-
-  THE REFERRAL EDGE IS USED IN OTHER QUERIES.
-*/
-
-  print "I read this!";
-}
\ No newline at end of file
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/Print_community.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/Print_community.gsql
index 1bc96d4..b88b196 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/Print_community.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/Print_community.gsql
@@ -1,6 +1,18 @@
 CREATE QUERY print_community(vertex input_prescriber) FOR GRAPH MyGraph SYNTAX V2 {
-  /* Write query logic here */
-  //PRINT "Print_community works!";
+  /*
+
+    Returns the edges of the community of a given prescriber
+
+    Sample input:
+      input_prescriber: pre14 | pre25
+
+    Using all Prescribers:
+    (1) Select prescribers whose community id matches that of the
+        input_prescriber and of the prescribers they refer
+    (2) Return the referral edges of the community
+
+  */
 
   ListAccum @@edge_list;
 
@@ -17,4 +29,4 @@ CREATE QUERY print_community(vertex input_prescriber) FOR GRAPH MyGr
   print start;
   print @@edge_list;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql
new file mode 100644
index 0000000..8c72195
--- /dev/null
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/README.gsql
@@ -0,0 +1,46 @@
+CREATE QUERY README() FOR GRAPH MyGraph {
+/*
+  IMPORTANT : PLEASE INSTALL AND RUN the insert_all_referrals QUERY FIRST.
+
+  THE REFERRAL EDGE IS USED IN OTHER QUERIES.
+*/
+
+  STRING name = "Graph-Analytics-Community-Detection-Algorithms";
+  STRING graph_description = "Find communities of a specific type in your network "
+      + "(Louvain Method, Connected Components, K-Core Decomposition, strongly connected components)";
+
+  STRING query_order = "1. insert_all_referrals";
insert_all_referrals"; + + STRING Print_community = "Returns edges of community given prescriber"; + STRING algo_louvain = "Louvain Method with Parallelism and Refinement https://arxiv.org/pdf/1304.4453 " + + "The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003"; + STRING algo_louvain_enhanced = "Louvain Method with Parallelism and Refinement https://arxiv.org/pdf/1304.4453 " + + "The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003"; + STRING algo_page_rank = "Compute the pageRank score for each vertex in the GRAPH"; + STRING conn_comp = "Identifies the Connected Components (undirected edges)"; + STRING conn_comp_enhanced = "Identifies the Connected Components (undirected edges)"; + STRING get_community = "Finds the vertices and interconnecting edges associated either with the given prescriber_Id, " + + "or if the prescriber_Id is not provided (empty string), then for the given community_Id."; + STRING insert_all_referrals = "Inserts and returns the total referrals across prescribers"; + STRING insert_referrals = "Inserts and returns number of referral insertions from unvisited claims"; + STRING kcore_decomp = "Outputs the k-core vertex membership for each value of k from k_min to k_max. "; + STRING kcore_max = "An implementation of Algorithm 2 in Scalable K-Core Decomposition for Static Graphs " + + "Using a Dynamic Graph Data Structure, Tripathy et al., IEEE Big Data 2018."; + STRING kcore_sub = "An implementation of Algorithm 2 in Scalable K-Core Decomposition for Static Graphs " + + "Using a Dynamic Graph Data Structure, Tripathy et al., IEEE Big Data 2018. "; + STRING scc = "Detects strongly connected components based on the following papers: " + + "https://www.sandia.gov/~apinar/papers/irreg00.pdf, " + + "https://www.sciencedirect.com/science/article/pii/S0743731505000535, " + + "https://stanford-ppl.github.io/website/papers/sc13-hong.pdf"; + STRING scc_enhanced = "Detects strongly connected components based on the following papers: " + + "https://www.sandia.gov/~apinar/papers/irreg00.pdf, " + + "https://www.sciencedirect.com/science/article/pii/S0743731505000535, " + + "https://stanford-ppl.github.io/website/papers/sc13-hong.pdf"; + STRING select_subgraph = "Returns edges and vertices of subgraph"; + + PRINT name, graph_description, query_order, Print_community, algo_louvain, algo_louvain_enhanced, algo_page_rank; + PRINT conn_comp, conn_comp_enhanced, get_community, insert_all_referrals, insert_referrals; + PRINT kcore_decomp, kcore_max, kcore_sub, scc, scc_enhanced, select_subgraph; + + print "I read this!"; +} diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql index b373b5d..dc39b14 100644 --- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql +++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain.gsql @@ -2,13 +2,28 @@ CREATE QUERY algo_louvain(INT iter1 = 10, INT iter2 = 10, INT iter3 = 10, INT sp Bool sort_by_pre_ID, Bool sort_by_comm_ID) FOR GRAPH MyGraph SYNTAX V2 { /* -* Louvain Method with Parallelism and Refinement -* https://arxiv.org/pdf/1304.4453 -* The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003 -* iter: There are three phases in the algorithm -- move, merge and refine. Their max number of iterations are set by iter1, iter2, iter3 respectively. 
-* split: To save memory, split number is 10 by default. When the split number is larger, the query is closer to sequential Louvain Method, which is slower. When the split number is 1, the query is parallel, but requires more memory.
-* output_level: 0, only list number; 1, also list members
-* fComm, fDist: files to store community label and community distribution
+  Louvain Method with Parallelism and Refinement
+  https://arxiv.org/pdf/1304.4453
+  The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003
+
+  Inputs:
+    iter: There are three phases in the algorithm -- move, merge and refine. Their max number of iterations are set by iter1, iter2, iter3 respectively.
+    split: To save memory, split number is 10 by default. When the split number is larger, the query is closer to sequential Louvain Method, which is slower. When the split number is 1, the query is parallel, but requires more memory.
+    output_level: 0, only list number; 1, also list members
+    fComm, fDist: files to store community label and community distribution
+
+  (1) Initialize: count edges and set a unique cluster ID for each vertex
+  (2) Phase 1 -- Move: incrementally calculates the modularity change of moving a vertex into every other community
+      and moves the vertex to the community with the highest modularity change
+      (a) For each vertex, calculate the change in modularity FROM adding it to each of the nearby clusters
+      (b) Add vertex to cluster with highest positive change in modularity
+      (c) Repeat the above until no vertices change cluster anymore
+  (3) Phase 2 -- Merge: Coarsen the graph by aggregating the vertices which are assigned in the same community into one vertex
+      (a) Select the vertices with minimal internal id to represent the coarsened graph
+      (b) Get @cweight from totalIncident
+      (c) Calculate num_patient incident from vertex to cluster in the coarsened graph; changes every iteration
+  (4) Phase 3 -- Refinement: run the first phase again on each vertex to do some small adjustments for the resulting communities
+
 */
  TYPEDEF TUPLE Cluster_Num;
  TYPEDEF TUPLE V_Delta_Q;
@@ -242,7 +257,6 @@ CREATE QUERY algo_louvain(INT iter1 = 10, INT iter2 = 10, INT iter3 = 10, INT sp
     log(debug > 0, "[redrain]#2_merge", iteration2, @@modularity2);
   END; // outer WHILE
-
   # Phase 3 -- Refinement
   iteration = 0;
   @@modularity = 0;
@@ -317,4 +331,4 @@ CREATE QUERY algo_louvain(INT iter1 = 10, INT iter2 = 10, INT iter3 = 10, INT sp
   END;
   PRINT start [start.communityId];
   PRINT "Community Detection Done";
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain_enhanced.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain_enhanced.gsql
index dcb0cfc..a822637 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain_enhanced.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_louvain_enhanced.gsql
@@ -4,13 +4,27 @@ CREATE QUERY algo_louvain_enhanced(STRING vertex_type, STRING edge_type,
 FOR GRAPH MyGraph SYNTAX V2 {
 /*
-* Louvain Method with Parallelism and Refinement
-* https://arxiv.org/pdf/1304.4453
-* The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003
-* iter: There are three phases in the algorithm -- move, merge and refine. Their max number of iterations are set by iter1, iter2, iter3 respectively.
-* split: To save memory, split number is 10 by default. When the split number is larger, the query is closer to sequential Louvain Method, which is slower. When the split number is 1, the query is parallel, but requires more memory.
-* output_level: 0, only list number; 1, also list members
-* fComm, fDist: files to store community label and community distribution
+  Louvain Method with Parallelism and Refinement
+  https://arxiv.org/pdf/1304.4453
+  The minimum label heuristics are implemented: https://doi.org/10.1016/j.parco.2015.03.003
+
+  Inputs:
+    iter: There are three phases in the algorithm -- move, merge and refine. Their max number of iterations are set by iter1, iter2, iter3 respectively.
+    split: To save memory, split number is 10 by default. When the split number is larger, the query is closer to sequential Louvain Method, which is slower. When the split number is 1, the query is parallel, but requires more memory.
+    output_level: 0, only list number; 1, also list members
+    fComm, fDist: files to store community label and community distribution
+
+
+  (1) Initialize: count edges and set a unique cluster ID for each vertex
+  (2) Phase 1 -- Move: incrementally calculates the modularity change of moving a vertex into every other community
+      and moves the vertex to the community with the highest modularity change
+      (a) For each vertex, calculate the change in modularity FROM adding it to each of the nearby clusters
+      (b) Add vertex to cluster with highest positive change in modularity
+      (c) Repeat the above until no vertices change cluster anymore
+  (3) Phase 2 -- Merge: Coarsen the graph by aggregating the vertices which are assigned in the same community into one vertex
+  (4) Phase 3 -- Refinement: run the first phase again on each vertex to do some small adjustments for the resulting communities
+
+
 */
  TYPEDEF TUPLE Cluster_Num;
  TYPEDEF TUPLE V_Delta_Q;
@@ -317,4 +331,4 @@ CREATE QUERY algo_louvain_enhanced(STRING vertex_type, STRING edge_type,
   END;
   PRINT start [start.@cid];
   PRINT "Community Detection Done";
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_page_rank.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_page_rank.gsql
index 3726ebd..f50a0cd 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_page_rank.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/algo_page_rank.gsql
@@ -1,12 +1,18 @@
 CREATE QUERY algo_page_rank(FLOAT max_change = 0.001, INT max_iter = 25,
     FLOAT damping = 0.85, INT output_limit) FOR GRAPH MyGraph SYNTAX V2 {
-  # Compute the pageRank score for each vertex in the GRAPH
-# In each iteration, compute a score for each vertex:
-#   score = (1-damping) + damping*sum(received scores FROM its neighbors).
-# The pageRank algorithm stops when either of the following is true:
-# a) it reaches max_iter iterations;
-# b) the max score change for any vertex compared to the last iteration <= max_change.
+/*
+  Compute the pageRank score for each vertex in the GRAPH
+
+  No inputs
+
+  From all Prescribers:
+  (1) In each iteration, compute a score for each vertex:
+      score = (1-damping) + damping*sum(received scores FROM its neighbors).
+  (2) The pageRank algorithm stops when either of the following is true:
+      a) it reaches max_iter iterations;
+      b) the max score change for any vertex compared to the last iteration <= max_change.
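+
+  Illustrative arithmetic (not from the source): with damping = 0.85, a vertex
+  whose neighbors send a summed score of 2.0 would be updated to
+  score = (1 - 0.85) + 0.85 * 2.0 = 2.3 on that iteration.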
+*/
  TYPEDEF TUPLE Vertex_Score;
  HeapAccum(output_limit, score DESC) @@top_scores;
@@ -34,4 +40,4 @@ CREATE QUERY algo_page_rank(FLOAT max_change = 0.001, INT max_iter = 25,
     PRINT @@top_scores;
   END;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql
index aeb7a58..99bd4fe 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp.gsql
@@ -1,6 +1,18 @@
 CREATE QUERY conn_comp (STRING vertex_type, STRING edge_type, STRING rev_edge_type) FOR GRAPH MyGraph SYNTAX V2 {
-# This query identifies the Connected Components (undirected edges)
+  /*
+    Identifies the Connected Components (undirected edges)
+
+    Sample inputs:
+      vertex_type: claim
+      edge_type: associated
+      rev_edge_type: reverse_associated
+
+    Start from given vertex_type:
+    (1) Initialize: Label each vertex with its own internal ID
+    (2) Propagate smaller internal IDs until no more ID changes can be done
+
+  */
 
  MinAccum @cc_id = 0;       //each vertex's tentative component id
  SumAccum @old_id = 0;
@@ -16,7 +28,7 @@ CREATE QUERY conn_comp (STRING vertex_type, STRING edge_type, STRING rev_edge_ty
          x.@old_id = getvid(x)
  ;
 
-# Propagate smaller internal IDs until no more ID changes can be DOne
+# Propagate smaller internal IDs until no more ID changes can be done
  WHILE (start.size()>0) DO
    start = SELECT t
      FROM start:s -((edge_type|rev_edge_type):e)- :t
@@ -38,4 +50,4 @@ CREATE QUERY conn_comp (STRING vertex_type, STRING edge_type, STRING rev_edge_ty
    POST-ACCUM @@comp_sizes += (s.@cc_id -> 1);
  PRINT @@comp_sizes;
  PRINT start [start.@cc_id];
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql
index 4db66fc..c348d95 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/conn_comp_enhanced.gsql
@@ -1,6 +1,19 @@
 CREATE QUERY conn_comp_enhanced (SET vertex_types, STRING vt2,
     STRING edge_type, STRING rev_edge_type, INT output_level) FOR GRAPH MyGraph SYNTAX V2 {
-# This query identifies the Connected Components (undirected edges)
+/*
+  Identifies the Connected Components (undirected edges)
+
+  Sample inputs:
+    vertex_types: claim
+    vt2: N/A
+    edge_type: associated
+    rev_edge_type: reverse_associated
+    output_level: 1
+
+  Start from given vertex_types:
+  (1) Initialize: Label each vertex with its own internal ID
+  (2) Propagate smaller internal IDs until no more ID changes can be done
+*/
 
  MinAccum @cc_id = 0;       //each vertex's tentative component id
  SumAccum @old_id = 0;
@@ -16,7 +29,7 @@ CREATE QUERY conn_comp_enhanced (SET vertex_types, STRING vt2,
          x.@old_id = getvid(x)
  ;
 
-# Propagate smaller internal IDs until no more ID changes can be DOne
+# Propagate smaller internal IDs until no more ID changes can be done
  WHILE (start.size()>0) DO
    start = SELECT t
      FROM start:s -((edge_type|rev_edge_type):e)- :t
@@ -41,4 +54,4 @@ CREATE QUERY conn_comp_enhanced (SET vertex_types, STRING vt2,
  IF output_level > 0 THEN
    PRINT start [start.@cc_id];
  END;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql
index a9147c7..61b1217 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/get_community.gsql
@@ -1,10 +1,22 @@
 CREATE QUERY get_community(STRING prescriber_Id, INT community_Id) FOR GRAPH MyGraph SYNTAX V2 {
-  /* This query finds the vertices and interconnecting edges associated either with the given
-  * prescriber_Id, or if the prescriber_Id is not provided (empty string), then
-  * for the given community_Id.
-  * NOTE: This algorithm requires that the community_Id attribute has been set,
-  * by running the alg_louvain query,
+  /*
+
+    Finds the vertices and interconnecting edges associated either with the given
+    prescriber_Id, or if the prescriber_Id is not provided (empty string), then
+    for the given community_Id.
+    NOTE: This algorithm requires that the community_Id attribute has been set
+    by running the algo_louvain query.
+
+    Sample inputs:
+      prescriber_Id: pre78 | pre30
+      community_Id: 10
+
+    Start from all Prescribers:
+    (1) Select prescribers where the id equals the given prescriber_id and set comm_Id
+    (2) Get all the vertices and interconnecting edges with the given comm_Id
+
  */
+
  SetAccum @@edge_list;
  INT comm_Id;
@@ -20,11 +32,11 @@ CREATE QUERY get_community(STRING prescriber_Id, INT community_Id) FOR GRAPH MyG
  END;
  PRINT comm_Id;
 
-  // Get all the vertices and intercomnnecting edges with the give comm_Id
+  // Get all the vertices and interconnecting edges with the given comm_Id
  comm_vertices = SELECT s
                  FROM start:s -(referral>:e)- :t
                  WHERE s.communityId == comm_Id AND t.communityId == comm_Id
                  ACCUM @@edge_list += e;
  PRINT comm_vertices[comm_vertices.Prescriber_id];
  PRINT @@edge_list;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_all_referrals.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_all_referrals.gsql
index 0e637df..546c78e 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_all_referrals.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_all_referrals.gsql
@@ -1,4 +1,14 @@
 CREATE QUERY insert_all_referrals () FOR GRAPH MyGraph SYNTAX V2 {
+
+/*
+
+  Inserts and returns the total referrals across prescribers
+
+  No inputs
+
+  From all Prescriber vertices:
+  (1) Select all prescribers, insert referrals, and count the number of referrals
+*/
 
  SumAccum @@num_insertions;
 
@@ -8,4 +18,4 @@ CREATE QUERY insert_all_referrals () FOR GRAPH MyGraph SYNTAX V2 {
      ACCUM @@num_insertions += insert_referrals(s);
 
  PRINT @@num_insertions;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql
index 4b4b93b..8b0b269 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/insert_referrals.gsql
@@ -1,6 +1,22 @@
 CREATE QUERY insert_referrals(VERTEX input_prescriber)
     FOR GRAPH MyGraph RETURNS (INT) SYNTAX V2 {
+  /*
+
+    Inserts and returns number of referral insertions from unvisited claims
+
+    Sample input:
+      input_prescriber: pre38
+
+    Start from the input_prescriber:
+    (1) Select claims from input_prescriber and mark as visited
+    (2) Select patients from the claims and update date lists
+    (3) Select other claims from patients in (2) that are unvisited
+    (4) Select claims from (3) and insert referral edges
+
+
+  */
+
  OrAccum @visited, @is_referred_claim;
  ListAccum @date_list;
  SumAccum @@num_insertions;
@@ -27,4 +43,4 @@ CREATE QUERY insert_referrals(VERTEX input_prescriber)
          @@num_insertions += 1;
  print start_set;
  RETURN @@num_insertions;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_decomp.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_decomp.gsql
index ccd396b..0dbaf15 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_decomp.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_decomp.gsql
@@ -1,12 +1,23 @@
 CREATE QUERY kcore_decomp(STRING vertex_type, STRING edge_type, INT k_min, INT k_max = -1,
     BOOL show_membership=false, BOOL show_shells=true) FOR GRAPH MyGraph SYNTAX V2 {
-/* Outputs the k-core vertex membership for each value of k from k_min to k_max.
- * By definition, for k=0, the vertex set = the entire graph.
- * As k increases, V(k) is a subset of V(k-1).
- * If k_max < 0, then the query proceeds until it reaches the maximal k-core.
- * Calls kcore_sub().
+/*
+  Outputs the k-core vertex membership for each value of k from k_min to k_max.
+  By definition, for k=0, the vertex set = the entire graph.
+  As k increases, V(k) is a subset of V(k-1).
+  If k_max < 0, then the query proceeds until it reaches the maximal k-core.
+  Calls kcore_sub().
+
+  Sample inputs:
+    vertex_type: claim
+    edge_type: associated
+    k_min: 0
+
+  (1) Print the size of each k-core
+  (2) Print the membership of each k-core
+
 */
+
  MapAccum> @@core_list_map;  // Map
  SetAccum @@k_core_vertices;  // vertex set for k_max
  ListAccum @@induced_edges;  // optional output
@@ -47,4 +58,4 @@ CREATE QUERY kcore_decomp(STRING vertex_type, STRING edge_type, INT k_min,
       END;
    END;
  END;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_max.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_max.gsql
index 8a166a0..2a44323 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_max.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_max.gsql
@@ -1,8 +1,19 @@
 CREATE QUERY kcore_max (STRING vertex_type, STRING edge_type, BOOL induced_edges, INT verbosity) FOR GRAPH MyGraph SYNTAX V2 {
-/* An implementation of Algorithm 2 in
- * Scalable K-Core Decomposition for Static Graphs Using a Dynamic Graph Data Structure,
- * Tripathy et al., IEEE Big Data 2018.
+/*
+  An implementation of Algorithm 2 in Scalable K-Core Decomposition for
+  Static Graphs Using a Dynamic Graph Data Structure, Tripathy et al.,
+  IEEE Big Data 2018.
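+
+  (Reminder, for context: the k-core of a graph is the maximal subgraph in
+  which every vertex has degree >= k; e.g., in a triangle with one pendant
+  vertex, the triangle is the 2-core.)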
+
+  Sample inputs:
+    vertex_type: claim
+    edge_type: associated
+    induced_edges: False
+    verbosity: 3
+
+  (1) Get vertex core numbers from kcore_sub query
+  (2) Print results for k, kcore_vertices, etc.
+
 */
 
  MapAccum> @@core_list_map;  // Map
@@ -25,4 +36,4 @@ CREATE QUERY kcore_max (STRING vertex_type, STRING edge_type, BOOL induced_edges
       ACCUM @@induced_edges += e;
    PRINT @@induced_edges;
  END;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_sub.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_sub.gsql
index 4388cfb..6979e26 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_sub.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/kcore_sub.gsql
@@ -1,8 +1,23 @@
 CREATE QUERY kcore_sub (STRING vertexType, STRING edgeType, INT verbosity) FOR GRAPH MyGraph RETURNS (MapAccum>) SYNTAX V2 {
-/* An implementation of Algorithm 2 in
- * Scalable K-Core Decomposition for Static Graphs Using a Dynamic Graph Data Structure,
- * Tripathy et al., IEEE Big Data 2018.
- * Returns a map > where are those who are in that k-core but not (k+1)-core
+/*
+  An implementation of Algorithm 2 in Scalable K-Core Decomposition for Static Graphs
+  Using a Dynamic Graph Data Structure, Tripathy et al., IEEE Big Data 2018.
+  Returns a map (core level k -> vertex list), where the vertices listed for k are
+  those in the k-core but not the (k+1)-core
+
+  Sample inputs:
+    vertexType: claim
+    edgeType: associated
+    verbosity: 5
+
+  Start from vertexType:
+  (1) Initialize @deg with vertexType's outdegree
+  (2) Find vertices whose degree <= k and mark those vertices individually
+  (3) Set the core level of those vertices and collect those vertices
+  (4) Reduce degree of vertices
+  (5) Print @@core_list_map, list of vertices sorted by increasing core level
+
+
 */
 
@@ -59,4 +74,4 @@ CREATE QUERY kcore_sub (STRING vertexType, STRING edgeType, INT verbosity) FOR G
    @@core_list_map += (k -> @@Q);
  END;
  RETURN @@core_list_map;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql
index bad4d8e..bc115af 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc.gsql
@@ -1,14 +1,27 @@
 CREATE QUERY scc (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10) FOR GRAPH MyGraph SYNTAX V2 {
-/* This query detects strongly connected components based on the following papers:
- * https://www.sandia.gov/~apinar/papers/irreg00.pdf
- * https://www.sciencedirect.com/science/article/pii/S0743731505000535
- * https://stanford-ppl.github.io/website/papers/sc13-hong.pdf
+/*
+  Detects strongly connected components based on the following papers:
+  https://www.sandia.gov/~apinar/papers/irreg00.pdf
+  https://www.sciencedirect.com/science/article/pii/S0743731505000535
+  https://stanford-ppl.github.io/website/papers/sc13-hong.pdf
 
-  * iter: number of iteration of the algorithm
-  * iter_wcc: find weakly connected components for the active vertices in this iteration, since the largest sccs are already found after several iterations; usually a small number(3 to 10)
-  * top_k_dist: top k result in scc distribution
+  Inputs:
+    iter: number of iterations of the algorithm
+    iter_wcc: find weakly connected components for the active vertices in
+              this iteration, since the largest sccs are already found after
+              several iterations; usually a small number (3 to 10)
+    top_k_dist: top k result in scc distribution
 
-  * DISTRIBUTED QUERY mode for this query is supported from TG 2.4.
+  * DISTRIBUTED QUERY mode for this query is supported from TG 2.4.
+
+  Select all Prescribers:
+  (1) Initialize accumulators
+  (2) Trim size 1 SCC
+  (3) Get WCC
+  (4) Mark forward set
+  (5) Mark backward set
+  (6) Return results of SCC detection
+
 */
  TYPEDEF TUPLE cluster_num;
  MapAccum @@cluster_size_map, @@cluster_dist_map;
@@ -166,4 +179,4 @@ CREATE QUERY scc (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10) FOR GRA
 
  PRINT v_all [v_all.@cid];
 
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc_enhanced.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc_enhanced.gsql
index b169ba1..c6b2eac 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc_enhanced.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/scc_enhanced.gsql
@@ -1,16 +1,29 @@
 CREATE QUERY scc_enhanced (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10) FOR GRAPH MyGraph SYNTAX V2 {
-/* This query detects strongly connected components based on the following papers:
- * https://www.sandia.gov/~apinar/papers/irreg00.pdf
- * https://www.sciencedirect.com/science/article/pii/S0743731505000535
- * https://stanford-ppl.github.io/website/papers/sc13-hong.pdf
+/*
+  Detects strongly connected components based on the following papers:
+  https://www.sandia.gov/~apinar/papers/irreg00.pdf
+  https://www.sciencedirect.com/science/article/pii/S0743731505000535
+  https://stanford-ppl.github.io/website/papers/sc13-hong.pdf
 
-  * iter: number of iteration of the algorithm
-  * iter_wcc: find weakly connected components for the active vertices in this iteration, since the largest sccs are already found after several iterations; usually a small number(3 to 10)
-  * top_k_dist: top k result in scc distribution
+  Inputs:
+    iter: number of iterations of the algorithm
+    iter_wcc: find weakly connected components for the active vertices in this
+              iteration, since the largest sccs are already found after several
+              iterations; usually a small number (3 to 10)
+    top_k_dist: top k result in scc distribution
 
-  * DISTRIBUTED QUERY mode for this query is supported from TG 2.4.
+  * DISTRIBUTED QUERY mode for this query is supported from TG 2.4.
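+
+  (FW-BW intuition, for context: for a chosen pivot vertex v, the SCC
+  containing v is exactly the intersection of the set of vertices reachable
+  from v with the set of vertices that can reach v.)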
+
+  Select all Prescribers:
+  (1) Initialize accumulators
+  (2) Trim size 1 SCC
+  (3) Get WCC
+  (4) Mark forward set
+  (5) Mark backward set
+  (6) Return results of SCC detection
 */
+
  TYPEDEF TUPLE Cluster_Num;
  MapAccum @@cluster_size_map, @@cluster_dist_map;
  HeapAccum(top_k_dist, csize DESC) @@cluster_dist_heap;
@@ -171,4 +184,4 @@ CREATE QUERY scc_enhanced (INT iter = 500, INT iter_wcc = 5, INT top_k_dist = 10
 
  PRINT v_all [v_all.@cid];
 
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/select_subgraph.gsql b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/select_subgraph.gsql
index 9c29bba..52146c4 100644
--- a/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/select_subgraph.gsql
+++ b/Graph-Analytics-Community-Detection-Algorithms/db_scripts/queries/select_subgraph.gsql
@@ -1,9 +1,20 @@
 CREATE QUERY select_subgraph(STRING vertex_type, STRING edge_type) FOR GRAPH MyGraph SYNTAX V2 {
-
+/*
+  Returns edges and vertices of subgraph
+
+  Sample inputs:
+    vertex_type: claim
+    edge_type: associated
+
+  From given vertex_type:
+  (1) Select edge list and target set from given edge_type
+
+*/
+
  ListAccum @@edge_list;
  source_set = {vertex_type};
  target_set = SELECT t
               FROM source_set:s -(edge_type:e)- :t
               ACCUM @@edge_list += e;
  PRINT source_set, target_set, @@edge_list;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/A_README.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/A_README.gsql
deleted file mode 100644
index 68e209f..0000000
--- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/A_README.gsql
+++ /dev/null
@@ -1,9 +0,0 @@
-CREATE QUERY A_README(/* Parameters here */) FOR GRAPH MyGraph SYNTAX V2 {
-
-  /**************************************************************
-  * IMPORTANT : PLEASE INSTALL AND RUN THE add_weights QUERY
-  * BEFORE RUNNING OTHER QUERIES
-  *************************************************************/
-
-  PRINT "I read it";
-}
\ No newline at end of file
diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql
new file mode 100644
index 0000000..00f6fd3
--- /dev/null
+++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/README.gsql
@@ -0,0 +1,20 @@
+CREATE QUERY README(/* Parameters here */) FOR GRAPH MyGraph SYNTAX V2 {
+
+  STRING name = "Graph-Analytics-Shortest-Path-Algorithms";
+  STRING graph_description = "Identify the path through your network with the fewest number of hops.";
+
+  STRING query_order = "1. add_weights; the rest in any order";
+
+  STRING add_weights = "Uses the haversine formula to calculate the distances between "
+      + "airports by using their latitude and longitude coordinates.";
+  STRING shortest_ss_no_wt = "Single-Source Shortest Path without weights on edges. "
" + + "Calculates the shortest distance from the given vertex source to all other " + + "connected vertices, and shows one shortest path between them."; + STRING shortest_ss_pos_wt = "The Bellman-Ford algorithm for single-Source Shortest Path " + + "on directed/undirected graph with positive weight"; + STRING shortest_ss_pos_wt_limits = "The Bellman-Ford algorithm for single-Source Shortest " + + "Path on directed/undirected graph with positive weight with limited number of hops " + + "and distance."; + + PRINT name, graph_description, query_order, add_weights, shortest_ss_no_wt, shortest_ss_pos_wt, shortest_ss_pos_wt_limits; +} diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/add_weights.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/add_weights.gsql index 7e7e705..8046292 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/add_weights.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/add_weights.gsql @@ -1,7 +1,17 @@ CREATE QUERY add_weights(BOOL overwrite) FOR GRAPH MyGraph SYNTAX V2 { -/* This query uses the haversine formula to calculate the distances -between airports by using their latitude and longitude coordinates. -The calculated distances are measured in miles and are added as edge weights. +/* + Uses the haversine formula to calculate the distances between + airports by using their latitude and longitude coordinates. + The calculated distances are measured in miles and are added as + edge weights. + + Sample Inputs: + overwrite: True | False + + Starting with all airports: + (1) Select target airports and calculate the distance + between source and target if overwrite is True + */ ListAccum @@dont_Change_List; @@ -34,4 +44,4 @@ The calculated distances are measured in miles and are added as edge weights. e.miles = ceil(R * c) END; PRINT @@dont_Change_List; -} \ No newline at end of file +} diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_no_wt.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_no_wt.gsql index c020565..7e5f1f9 100644 --- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_no_wt.gsql +++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_no_wt.gsql @@ -1,7 +1,21 @@ CREATE QUERY shortest_ss_no_wt(VERTEX source, BOOL display) FOR GRAPH MyGraph SYNTAX V2 { - /* This query is Single-Source Shortest Path without weights on edges. It calculates the shortest distance from the given vertex source to all other connected vertices, and shows one shortest path between them. -The JSON version also show visualization of the network. -The attribute version only store the distance into attribute, not the path. + /* + Single-Source Shortest Path without weights on edges. Calculates the shortest distance + from the given vertex source to all other connected vertices, and shows one shortest + path between them. + + The JSON version also show visualization of the network. + The attribute version only store the distance into attribute, not the path. + + Sample Inputs: + source: airport = "Goroka Airport" + display: True | False + + Start from the source vertex: + (1) Initialize local accumulators + (2) Select connected vertices, calculates shortest distance, and gets shortest + path + (3) Print results if display is True */ MinAccum @dis; @@ -38,4 +52,4 @@ The attribute version only store the distance into attribute, not the path. 
       ACCUM @@edge_Set += e;
    PRINT @@edge_Set;
  END;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt.gsql
index 3478223..4744f08 100644
--- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt.gsql
+++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt.gsql
@@ -3,9 +3,20 @@ CREATE QUERY shortest_ss_pos_wt (VERTEX source, BOOL display) FOR GRAPH MyGraph
    and increase the time outdegree */
 
-/* The Bellman-Ford algorithm for single-Source Shortest Path
+/*
+  The Bellman-Ford algorithm for single-Source Shortest Path
   on directed/undirected graph with positive weight.
   It will not detect negative cycle in this algorithm.
+
+  Sample Inputs:
+    source: airport = "Goroka Airport"
+    display: True | False
+
+  Start with the source vertex:
+  (1) Get connected vertices and update local accumulators
+  (2) Do V-1 iterations: Consider whether each edge lowers the best-known distance.
+  (3) Calculate shortest paths and display results if display is True
+
 */
  TYPEDEF TUPLE pathTuple;
  HeapAccum(1, dist ASC) @minPath;
@@ -69,4 +80,4 @@ CREATE QUERY shortest_ss_pos_wt (VERTEX source, BOOL display) FOR GRAPH MyGraph
      ACCUM @@edge_Set += e;
    PRINT @@edge_Set;
  END;
-}
\ No newline at end of file
+}
diff --git a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql
index 7f6dc21..64a0fec 100644
--- a/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql
+++ b/Graph-Analytics-Shortest-Path-Algorithms/db_scripts/queries/shortest_ss_pos_wt_limits.gsql
@@ -1,8 +1,22 @@
 CREATE QUERY shortest_ss_pos_wt_limits (VERTEX source, BOOL display, INT maxHops,
     INT maxDest) FOR GRAPH MyGraph SYNTAX V2 {
-/* The Bellman-Ford algorithm for single-Source Shortest Path
-  on directed/undirected graph with positive weight.
-  It will not detect negative cycle in this algorithm.
+/*
+  The Bellman-Ford algorithm for single-Source Shortest Path
+  on directed/undirected graph with positive weight with
+  limited number of hops and distance.
+  It will not detect negative cycles.
+
+  Sample Inputs:
+    source: airport = "Goroka Airport"
+    display: True | False
+    maxHops: 3
+    maxDest: 10
+
+  Start with the source vertex:
+  (1) Get connected vertices and update local accumulators
+  (2) Do V-1 iterations: Consider whether each edge lowers the best-known distance.
+  (3) Calculate shortest paths and display results if display is True
+
 */
  TYPEDEF TUPLE pathTuple;
  HeapAccum(1, dist ASC) @min_Path;
@@ -71,4 +85,4 @@ CREATE QUERY shortest_ss_pos_wt_limits (VERTEX source, BOOL display, INT maxHops
      ACCUM @@edge_Set += e;
    PRINT @@edge_Set;
  END;
-}
\ No newline at end of file
+}
diff --git a/Graph-Convolutional-Networks/db_scripts/queries/README.gsql b/Graph-Convolutional-Networks/db_scripts/queries/README.gsql
index e7988f0..078e28d 100644
--- a/Graph-Convolutional-Networks/db_scripts/queries/README.gsql
+++ b/Graph-Convolutional-Networks/db_scripts/queries/README.gsql
@@ -1,21 +1,27 @@
-CREATE QUERY README(/* Parameters here */) FOR GRAPH CitationGraph SYNTAX V2 {
-  /*
-The recommendation system can predict the movie ratings based on the latent factor (model-based) method.
-To train the latent factor model, run the queries below in sequence
-The graph convolutional network (GCN) is applied for node classification.
-Specifically in this starter kit, it is used to prediction the class of the papers in a citation network
-The hyperparameters in the GCN model is suggested in Thomas N. Kipf and Max Welling, ICLR (2017).
-To train the GCN, the order of the queries below must be followed to obtain the useful prediction.
+CREATE QUERY README(/* Parameters here */) FOR GRAPH CitationGraph SYNTAX V2 {
 
-1. initialization
-2. weight_initialization
-3. training
-4. predicting
+STRING name = "Graph-Convolutional-Networks";
+STRING graph_description = "The graph convolutional network (GCN) is applied for node classification.
+  Specifically in this starter kit, it is used to predict the class of the papers in a citation
+  network. The hyperparameters in the GCN model are suggested in Thomas N. Kipf and Max Welling, ICLR (2017).
+  To train the GCN, the order of the queries below must be followed to obtain useful predictions.";
 
-To re-train model using different training data split,
-users can modify the initialization query before repeat the steps above.
-The order of the queries need to be followed to ensure the correctness of the result.
-  */
-
-  PRINT "README worked!";
-}
\ No newline at end of file
+STRING query_order = "1. initialization, 2. weight_initialization, 3. training, 4. predicting";
+STRING order_note = "To re-train the model using a different training data split,
+  users can modify the initialization query before repeating the steps above.
+  The order of the queries needs to be followed to ensure the correctness of the result.";
+
+STRING initialization = "Initializes weights on edges and attributes before splitting vertices into
+  training, validation, and testing sets.";
+STRING weight_initialization = "Initializes the weights for the neural network.";
+STRING training = "Trains the graph convolutional neural network on the training dataset
+  and evaluates the loss on the validation data and the prediction accuracy on the
+  testing data.";
+STRING predicting = "Predicts the class of the papers in a citation network and
+  returns accuracies.";
+
+PRINT name, graph_description, query_order, order_note, initialization, weight_initialization, training, predicting;
+
+}
diff --git a/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql b/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql
index 78d2b54..37a8155 100644
--- a/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql
+++ b/Graph-Convolutional-Networks/db_scripts/queries/initialization.gsql
@@ -1,9 +1,22 @@
 CREATE QUERY initialization(/* Parameters here */) FOR GRAPH CitationGraph SYNTAX V2 {
-  /*This query normalizes the weights on CITE edges according to the outdegrees(CITE) of the source
-  and target vertices, normalizes the weights on HAS edges according the outdegrees(HAS) of the PAPER vertices,
-  populates the words attribute with (word indx -> weight),
-  and splits PAPER vertices into testing, validation and training sets, */
-  MapAccum @word_Map;
+/*
+
+  Initializes weights on edges and attributes before splitting vertices into
+  training, validation, and testing sets.
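+
+  (Presumed normalization, for context -- matching Kipf & Welling's GCN: the
+  weight stored on a CITE edge (s,t) would be 1/sqrt(outdegree(s)*outdegree(t)),
+  i.e. the symmetric normalization D^(-1/2) A D^(-1/2).)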
+
+  No inputs
+
+  Starting from all PAPER vertices:
+  (1) Normalize the weights on CITE edges according to the outdegrees(CITE)
+      of the source and target vertices
+  (2) Normalize the weights on HAS edges according to the outdegrees(HAS) of
+      the PAPER vertices
+  (3) Populate the words attribute with (word indx -> weight)
+  (4) Split PAPER vertices into testing, validation and training sets
+
+*/
+
+  MapAccum @word_Map;
 
  Papers = {PAPER.*};
 
@@ -21,4 +34,4 @@ CREATE QUERY initialization(/* Parameters here */) FOR GRAPH CitationGraph SYNTA
       END,
       s.words = s.@word_Map;
  PRINT "initialization finished!";
-}
\ No newline at end of file
+}
diff --git a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql
index 9bd7665..2b062aa 100644
--- a/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql
+++ b/Graph-Convolutional-Networks/db_scripts/queries/predicting.gsql
@@ -1,4 +1,20 @@
 CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 {
+
+/*
+  Predicts the class of the papers in a citation network and
+  returns accuracies.
+
+  No inputs
+
+  Start from WORD vertices:
+  (1) Load weights into layer 0 and 1
+
+  Using PAPER vertices:
+  (1) Forward propagation
+      (a) Convolve
+      (b) Hidden layers
+
+*/
  ArrayAccum> @@W_0[1433][16];  #1433 by 16
  ArrayAccum> @@W_1[16][7];  #16 by 7
  SumAccum @@accurate_cnt;
@@ -39,7 +55,7 @@ CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 {
       ACCUM t.@z_0 += product_ArrayAccum_const(s.@zeta_0,e.weight)
       POST-ACCUM
         s.@z_0 = ReLU_ArrayAccum(s.@z_0),
-        // hidden layer0 -> hidden layer1 */
+        // hidden layer0 -> hidden layer1
         s.@zeta_1 += product_Matrix_Vector(@@W_1, s.@z_0)
    ;
 
@@ -60,4 +76,4 @@ CREATE QUERY predicting() FOR GRAPH CitationGraph SYNTAX V2 {
    ;
  PRINT @@accurate_cnt/test_cnt AS accuracy;
  PRINT @@Graph,Start[Start.@prediction,Start.class_label];
-}
\ No newline at end of file
+}
diff --git a/Graph-Convolutional-Networks/db_scripts/queries/training.gsql b/Graph-Convolutional-Networks/db_scripts/queries/training.gsql
index 0b5a363..2a95859 100644
--- a/Graph-Convolutional-Networks/db_scripts/queries/training.gsql
+++ b/Graph-Convolutional-Networks/db_scripts/queries/training.gsql
@@ -1,17 +1,44 @@
 CREATE QUERY training(
   DOUBLE alpha0 = 0.4,  // initial learning rate
+  BOOL Adam = True,  // enable Adam optimizer. If False, constant learning rate will be used
+  DOUBLE beta1 = 0.9,  // hyperparameter for Adam optimizer
+  DOUBLE beta2 = 0.999,  // hyperparameter for Adam optimizer
+  DOUBLE keepProb = 1.0,  // keep probability for the dropout regularization
+  DOUBLE lambda = 0.00005,  // L2 regularization factor
+  INT MaxIter = 10)  // number of epochs
+
 FOR GRAPH CitationGraph SYNTAX V2 {
-  /*This query trains the graph convolutional neural network on the training dataset
-  and evaluates the loss on the validation data and the prediction accuracy on the testing data. */
-  ArrayAccum> @@W_0[1433][16];  // 1433 by 16
+
+/*
+  Trains the graph convolutional neural network on the training dataset
+  and evaluates the loss on the validation data and the prediction accuracy on the
+  testing data.
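+
+  (Adam update sketch, for context -- matching the accumulators used below:
+  VdW = beta1*VdW + (1-beta1)*dW and SdW = beta2*SdW + (1-beta2)*dW^2,
+  followed by roughly W += -alpha * VdW_hat / (sqrt(SdW_hat) + eps) with
+  bias-corrected VdW_hat and SdW_hat; see the AdamGrdient UDF call.)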
+
+  No sample inputs
+
+  Start with all WORD vertices:
+  (1) Load weights into layer 0 and layer 1
+
+  Using PAPER vertices:
+  (1) Forward Propagation
+      (a) Convolve
+      (b) Hidden layers
+  (2) Backward Propagation
+      (a) Train with loss (Use Adam)
+      (b) Update weights
+
+ */
+
+
+  ArrayAccum> @@W_0[1433][16];  // 1433 by 16
   ArrayAccum> @@W_1[16][7];  // 16 by 7
-  ArrayAccum> @@dW_0[1433][16];  // 1433 by 16
+  ArrayAccum> @@dW_0[1433][16];  // 1433 by 16
   ArrayAccum> @@dW_1[16][7];  // 16 by 7
   ArrayAccum> @@VdW_0[1433][16];  // 1433 by 16
   ArrayAccum> @@VdW_1[16][7];  // 16 by 7
@@ -21,7 +48,7 @@ FOR GRAPH CitationGraph SYNTAX V2 {
   SumAccum @@Validation_Loss;
   SumAccum @@accurate_cnt;
-  MapAccum @words;
+  MapAccum @words;
   ArrayAccum> @zeta_0[16];
   ArrayAccum> @zeta_1[7];
   ArrayAccum> @dzeta_0[16];
@@ -36,7 +63,8 @@ FOR GRAPH CitationGraph SYNTAX V2 {
   INT train_cnt = 140;
   INT val_cnt = 500;
   INT test_cnt = 1000;
-  // load weights into @@W_0 and @@W_1
+
+  // load weights into @@W_0 and @@W_1
   WORDs = {WORD.*};
   LAYER_0s = SELECT t FROM WORDs:s -(:e)- LAYER_0:t
              ACCUM
@@ -49,110 +77,111 @@ FOR GRAPH CitationGraph SYNTAX V2 {
 
   // forward propagation
   Start = {PAPER.*};
-  alpha = alpha0;
+  alpha = alpha0;
 
-WHILE iter < MaxIter DO
-  // input -> hidden layer0
-  @@Training_Loss = 0;
-  @@Validation_Loss = 0;
-  @@accurate_cnt = 0;
-  @@dW_0.reallocate(1433,16);
-  @@dW_1.reallocate(16,7);
-  Start = SELECT s FROM Start:s
-    POST-ACCUM
-      s.@zeta_0.reallocate(16),
-      s.@z_0.reallocate(16),
-      s.@zeta_1.reallocate(7),
-      s.@z_1.reallocate(7),
-      s.@dzeta_0.reallocate(16),
-      s.@dz_0.reallocate(16),
-      s.@dzeta_1.reallocate(7),
-      s.@dz_1.reallocate(7),
-      s.@words = dropout_SparseVector(s.words, keepProb),
-      s.@zeta_0 += product_Matrix_SparseVector(@@W_0, s.@words)
-  ;
+  WHILE iter < MaxIter DO
+    // input -> hidden layer0
+    @@Training_Loss = 0;
+    @@Validation_Loss = 0;
+    @@accurate_cnt = 0;
+    @@dW_0.reallocate(1433,16);
+    @@dW_1.reallocate(16,7);
+    Start = SELECT s FROM Start:s
      POST-ACCUM
        s.@zeta_0.reallocate(16),
        s.@z_0.reallocate(16),
        s.@zeta_1.reallocate(7),
        s.@z_1.reallocate(7),
        s.@dzeta_0.reallocate(16),
        s.@dz_0.reallocate(16),
        s.@dzeta_1.reallocate(7),
        s.@dz_1.reallocate(7),
        s.@words = dropout_SparseVector(s.words, keepProb),
        s.@zeta_0 += product_Matrix_SparseVector(@@W_0, s.@words)
    ;
 
-  // convolve
-  Start = SELECT s FROM Start:s -(CITE:e)- PAPER:t
-    ACCUM t.@z_0 += product_ArrayAccum_const(s.@zeta_0,e.weight)
-    POST-ACCUM
-      s.@z_0 = ReLU_ArrayAccum(s.@z_0),
-      s.@z_0 = dropout_ArrayAccum(s.@z_0, keepProb),
-      // hidden layer0 -> hidden layer1
-      s.@zeta_1 += product_Matrix_Vector(@@W_1, s.@z_0)
-  ;
-
-  // convolve
-  Start = SELECT s FROM Start:s -(CITE:e)- PAPER:t
-    ACCUM t.@z_1 += product_ArrayAccum_const(s.@zeta_1,e.weight)
-    POST-ACCUM
-      s.@y = softmax_ArrayAccum(s.@z_1),
-      CASE
-        WHEN s.train THEN
-          s.@dz_1 = diff_ArrayAccum_oneHotVec(s.@y,s.class_label),
-          @@Training_Loss += -log(s.@y[s.class_label])
-        WHEN s.validation THEN
-          @@Validation_Loss += -log(s.@y[s.class_label])
-        WHEN s.test THEN
-          INT y_prediction = 0,
-          DOUBLE maxProb = s.@y[0],
-          FOREACH i IN RANGE[1,6] DO
-            IF s.@y[i] > maxProb THEN y_prediction = i, maxProb = s.@y[i] END
-          END,
-          IF y_prediction == s.class_label THEN @@accurate_cnt += 1 END
-      END;
+    // convolve
+    Start = SELECT s FROM Start:s -(CITE:e)- PAPER:t
      ACCUM t.@z_0 += product_ArrayAccum_const(s.@zeta_0,e.weight)
      POST-ACCUM
        s.@z_0 = ReLU_ArrayAccum(s.@z_0),
        s.@z_0 = dropout_ArrayAccum(s.@z_0, keepProb),
        // hidden layer0 -> hidden layer1
        s.@zeta_1 += product_Matrix_Vector(@@W_1, s.@z_0)
+    ;
 
-  // backpropagation
-
-  Training1 = SELECT t FROM Start:s -(CITE:e)- PAPER:t
-    WHERE s.train
-    ACCUM t.@dzeta_1 += product_ArrayAccum_const(s.@dz_1,e.weight)
-    POST-ACCUM
-      t.@dz_0 += product_Vector_Matrix(@@W_1,t.@dzeta_1),
-      t.@dz_0 = greater_than_zero_ArrayAccum_ArrayAccum(t.@dz_0, t.@z_0),
-      FOREACH i IN RANGE[0,15] DO
-        FOREACH j IN RANGE[0,6] DO
-          @@dW_1[i][j] += t.@z_0[i]*t.@dzeta_1[j]
-        END
-      END
-  ;
-  Training0 = SELECT t FROM Training1:s -(CITE:e)- PAPER:t
-    ACCUM t.@dzeta_0 += product_ArrayAccum_const(s.@dz_0,e.weight)
-    POST-ACCUM
-      FOREACH (k,v) IN t.@words DO
-        FOREACH i IN RANGE[0,15] DO
-          @@dW_0[k][i] += v*t.@dzeta_0[i]
-        END
-      END
-  ;
-  @@Training_Loss += lambda*L2Norm_Matrix(@@W_0);
-  @@dW_0 += product_Matrix_const(@@W_0, lambda);
-  // @@dW_1 += product_Matrix_const(@@W_1, lambda); // only apply to the first layer
-  iter = iter + 1;
-  IF Adam THEN
-    @@VdW_0 = product_Matrix_const(@@VdW_0,beta1)+product_Matrix_const(@@dW_0,1-beta1);
-    @@VdW_1 = product_Matrix_const(@@VdW_1,beta1)+product_Matrix_const(@@dW_1,1-beta1);
-    @@SdW_0 = product_Matrix_const(@@SdW_0,beta2)+product_MatrixSqr_const(@@dW_0,1-beta2);
-    @@SdW_1 = product_Matrix_const(@@SdW_1,beta2)+product_MatrixSqr_const(@@dW_1,1-beta2);
-    @@W_0 += AdamGrdient(@@VdW_0,@@SdW_0,iter,alpha,beta1,beta2);
-    @@W_1 += AdamGrdient(@@VdW_1,@@SdW_1,iter,alpha,beta1,beta2);
-  ELSE
-    @@W_0 += product_Matrix_const(@@dW_0, -alpha);
-    @@W_1 += product_Matrix_const(@@dW_1, -alpha);
-  END;
+    // convolve
+    Start = SELECT s FROM Start:s -(CITE:e)- PAPER:t
      ACCUM t.@z_1 += product_ArrayAccum_const(s.@zeta_1,e.weight)
      POST-ACCUM
        s.@y = softmax_ArrayAccum(s.@z_1),
        CASE
          WHEN s.train THEN
            s.@dz_1 = diff_ArrayAccum_oneHotVec(s.@y,s.class_label),
            @@Training_Loss += -log(s.@y[s.class_label])
          WHEN s.validation THEN
            @@Validation_Loss += -log(s.@y[s.class_label])
          WHEN s.test THEN
            INT y_prediction = 0,
            DOUBLE maxProb = s.@y[0],
            FOREACH i IN RANGE[1,6] DO
              IF s.@y[i] > maxProb THEN y_prediction = i, maxProb = s.@y[i] END
            END,
            IF y_prediction == s.class_label THEN @@accurate_cnt += 1 END
        END;
 
-
-  PRINT iter,@@Training_Loss/train_cnt AS Training_Loss,@@Validation_Loss/val_cnt AS Validation_Loss,@@accurate_cnt/test_cnt AS accuracy;//,@@train_accurate_cnt,@@val_accurate_cnt;
-END;
-
-  // persist @@W_0 and @@W_1 in weights
-  WORDs = {WORD.*};
-  LAYER_0s = SELECT t FROM WORDs:s -(:e)- LAYER_0:t
-    ACCUM
-      e.weight = @@W_0[s.indx][t.indx];
+    // backpropagation
 
-  LAYER_1s = SELECT t FROM LAYER_0s:s -(:e)- LAYER_1:t
-    ACCUM
-      e.weight = @@W_1[s.indx][t.indx];
-
-}
\ No newline at end of file
+    Training1 = SELECT t FROM Start:s -(CITE:e)- PAPER:t
      WHERE s.train
      ACCUM t.@dzeta_1 += product_ArrayAccum_const(s.@dz_1,e.weight)
      POST-ACCUM
        t.@dz_0 += product_Vector_Matrix(@@W_1,t.@dzeta_1),
        t.@dz_0 = greater_than_zero_ArrayAccum_ArrayAccum(t.@dz_0, t.@z_0),
        FOREACH i IN RANGE[0,15] DO
          FOREACH j IN RANGE[0,6] DO
            @@dW_1[i][j] += t.@z_0[i]*t.@dzeta_1[j]
          END
        END
    ;
+    Training0 = SELECT t FROM Training1:s -(CITE:e)- PAPER:t
      ACCUM t.@dzeta_0 += product_ArrayAccum_const(s.@dz_0,e.weight)
      POST-ACCUM
        FOREACH (k,v) IN t.@words DO
          FOREACH i IN RANGE[0,15] DO
            @@dW_0[k][i] += v*t.@dzeta_0[i]
          END
        END
    ;
+    @@Training_Loss += lambda*L2Norm_Matrix(@@W_0);
+    @@dW_0 += product_Matrix_const(@@W_0, lambda);
+    // @@dW_1 += product_Matrix_const(@@W_1, lambda); // only apply to the first layer
+    iter = iter + 1;
+
+    IF Adam THEN
+      @@VdW_0 = product_Matrix_const(@@VdW_0,beta1)+product_Matrix_const(@@dW_0,1-beta1);
+      @@VdW_1 = product_Matrix_const(@@VdW_1,beta1)+product_Matrix_const(@@dW_1,1-beta1);
+      @@SdW_0 = product_Matrix_const(@@SdW_0,beta2)+product_MatrixSqr_const(@@dW_0,1-beta2);
+      @@SdW_1 = product_Matrix_const(@@SdW_1,beta2)+product_MatrixSqr_const(@@dW_1,1-beta2);
+      @@W_0 += AdamGrdient(@@VdW_0,@@SdW_0,iter,alpha,beta1,beta2);
+      @@W_1 += AdamGrdient(@@VdW_1,@@SdW_1,iter,alpha,beta1,beta2);
+    ELSE
+      @@W_0 += product_Matrix_const(@@dW_0, -alpha);
+      @@W_1 += product_Matrix_const(@@dW_1, -alpha);
+    END;
+
+
+    PRINT iter,@@Training_Loss/train_cnt AS Training_Loss,@@Validation_Loss/val_cnt AS Validation_Loss,@@accurate_cnt/test_cnt AS accuracy;//,@@train_accurate_cnt,@@val_accurate_cnt;
+  END;
+
+  // persist @@W_0 and @@W_1 in weights
+  WORDs = {WORD.*};
+  LAYER_0s = SELECT t FROM WORDs:s -(:e)- LAYER_0:t
+    ACCUM
+      e.weight = @@W_0[s.indx][t.indx];
+
+  LAYER_1s = SELECT t FROM LAYER_0s:s -(:e)- LAYER_1:t
+    ACCUM
+      e.weight = @@W_1[s.indx][t.indx];
+
+  }
diff --git a/Graph-Convolutional-Networks/db_scripts/queries/weight_initialization.gsql b/Graph-Convolutional-Networks/db_scripts/queries/weight_initialization.gsql
index 647653b..fb0e755 100644
--- a/Graph-Convolutional-Networks/db_scripts/queries/weight_initialization.gsql
+++ b/Graph-Convolutional-Networks/db_scripts/queries/weight_initialization.gsql
@@ -1,6 +1,16 @@
 CREATE QUERY weight_initialization() FOR GRAPH CitationGraph SYNTAX V2 {
-  /*This query initializes the weights for the neural network.
-  The neural network has 1433 neurons in the input layer, 16 neurons in the hidden layer and 7 neurons in the output layer */
+/*
+  Initializes the weights for the neural network which has
+  1433 neurons in the input layer, 16 neurons in the hidden
+  layer and 7 neurons in the output layer
+
+  No inputs
+
+  Starting with all WORD vertices:
+  (1) Calculate weight of all edges between words and layer 0
+  (2) Calculate weight of all edges between layer 0 and 1
+*/
+
  INT input_dim = 1433;
  INT hidden_dim = 16;
  INT output_dim = 7;
@@ -17,4 +27,4 @@ CREATE QUERY weight_initialization() FOR GRAPH CitationGraph SYNTAX V2 {
       e.weight = 2*sqrt(6.0/(output_dim+hidden_dim))*(rand_uniform()-0.5);
 
  PRINT "weight_initialization finished";
-}
\ No newline at end of file
+}
diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql
new file mode 100644
index 0000000..4b37951
--- /dev/null
+++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/README.gsql
@@ -0,0 +1,14 @@
+CREATE QUERY README(/* Parameters here */) FOR GRAPH faers SYNTAX V2 {
+
+  STRING name = "Healthcare-Graph-Drug-Interaction-FAERS";
+  STRING graph_description = "Healthcare example focused on public (FAERS) and private data for pharmaceutical drugs";
+
+  STRING query_order = "No order";
+
+  STRING jaccard_nbor_reaction = "Calculates the Jaccard Similarity between a given vertex and every other vertex.";
+  STRING most_reported_drugs_for_company_v2 = "Returns most reported drugs given a company.";
+  STRING top_side_effects_for_top_drugs = "Returns most mentioned drugs, the number of reported cases, and their side effects.";
+
+  PRINT name, graph_description, query_order, jaccard_nbor_reaction, most_reported_drugs_for_company_v2, top_side_effects_for_top_drugs;
+
+}
diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql
index 227b1ed..3a75bae 100644
--- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql
+++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/jaccard_nbor_reaction.gsql
@@ -1,15 +1,26 @@
 CREATE QUERY jaccard_nbor_reaction(VERTEX source, STRING etype ="hasReactions", INT top_k=100, INT sampSize=100) FOR GRAPH faers SYNTAX v1 {
-  //example: ReportedCase=100640876
+
 /*
-Calculates the Jaccard Similarity between a given vertex and every other
-vertex. A simplified version of the generic purpose algorithm
-jacccard_nbor_ss in the GSQL Graph Data Science Library
-https://github.com/tigergraph/gsql-graph-algorithms
+  Calculates the Jaccard Similarity between a given vertex and every other
+  vertex. A simplified version of the generic purpose algorithm
+  jaccard_nbor_ss in the GSQL Graph Data Science Library
+  https://github.com/tigergraph/gsql-graph-algorithms
+
+  Note: In versions 3.5 and earlier, the SAMPLE clause was only supported in Syntax V1,
+  so this query uses Syntax V1. The default Syntax V2 may be used in future versions that support SAMPLE.
+
+  Sample inputs:
+    source: ReportedCase=100640876
+
+  Starting from the source vertex:
+  (1) Get the outdegree set size from the source
+  (2) Get neighbors of the source
+  (3) Select others (neighbors of neighbors) and calculate
+      the Jaccard similarity
+  (4) Order others by similarity to the source
 
-Note: In versions 3.5 and earlier, the SAMPLE clause was only supported in Syntax V1,
-so this query uses Syntax V1. The default Syntax V2 may be used in future versions that support SAMPLE.
 */
 
  SumAccum @intersection_Size, @@set_size_A, @set_size_B;
@@ -38,4 +49,4 @@ so this query uses Syntax V1. The default Syntax V2 may be used in future versio
 
  PRINT Others;
  PRINT @@t_Size, Others.size();
-}
\ No newline at end of file
+}
diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql
index 37ab2d2..8d92fda 100644
--- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql
+++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql
@@ -1,26 +1,35 @@
 CREATE QUERY most_reported_drugs_for_company_v2(STRING company_name="PFIZER",
     INT k=5, STRING role="PS") FOR GRAPH faers SYNTAX v2 {
-  // Possible values for role: PS, SS, I, C
-  // PS = primary suspect drug, SS = secondary suspect drug
-  // C = concomitant, I = interacting
+
+  /*
+    Returns most reported drugs given a company.
+
+    Sample inputs:
+      role: PS | SS | I | C
+        PS = primary suspect drug, SS = secondary suspect drug
+        C = concomitant, I = interacting
+
+    Starting with all pharma companies:
+    (1) Find all cases where the given pharma company is the 'mfr_sndr'
+    (2) Find all drug sequences for the selected cases.
+    (3) Count occurrences of each drug mentioned in each drug sequence.
+    (4) Print top drugs
+*/
 
  // Keep count of how many times each drug is mentioned.
  SumAccum @num_Cases;
 
-  // 1. Find all cases where the given pharma company is the 'mfr_sndr'
  Company = {PharmaCompany.*};
  Cases = SELECT c
    FROM Company:s -(relatedTo:e)- ReportedCase:c
    WHERE s.mfr_sndr == company_name
  ;
 
-  // 2. Find all drug sequences for the selected cases.
diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql index 37ab2d2..8d92fda 100644 --- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/most_reported_drugs_for_company_v2.gsql @@ -1,26 +1,35 @@ CREATE QUERY most_reported_drugs_for_company_v2(STRING company_name="PFIZER", INT k=5, STRING role="PS") FOR GRAPH faers SYNTAX v2 { - // Possible values for role: PS, SS, I, C - // PS = primary suspect drug, SS = secondary suspect drug - // C = concomitant, I = interacting + + /* + Returns most reported drugs given a company. + + Sample inputs: + role: PS | SS | I | C + PS = primary suspect drug, SS = secondary suspect drug + C = concomitant, I = interacting + + Starting with all pharma companies: + (1) Find all cases where the given pharma company is the 'mfr_sndr' + (2) Find all drug sequences for the selected cases. + (3) Count occurrences of each drug mentioned in each drug sequence. + (4) Print top drugs +*/ // Keep count of how many times each drug is mentioned. SumAccum<INT> @num_Cases; - // 1. Find all cases where the given pharma company is the 'mfr_sndr' Company = {PharmaCompany.*}; Cases = SELECT c FROM Company:s -(relatedTo:e)- ReportedCase:c WHERE s.mfr_sndr == company_name ; - // 2. Find all drug sequences for the selected cases. DrugSeqs = SELECT ds FROM Cases:c -(hasSequences:e)- DrugSequence:ds WHERE (role == "" OR ds.role_cod == role) ; - // 3. Count occurences of each drug mentioned in each drug sequence. TopDrugs = SELECT d FROM DrugSeqs:ds -(hasDrugs:e)- Drug:d ACCUM d.@num_Cases += 1 @@ -29,4 +38,4 @@ CREATE QUERY most_reported_drugs_for_company_v2(STRING company_name="PFIZER", ; PRINT TopDrugs; -} \ No newline at end of file +}
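Both FAERS reporting queries use the same tally-then-truncate idiom: ACCUM into a per-vertex counter, then ORDER BY ... LIMIT k. A minimal standalone sketch of the pattern (query name hypothetical; the schema objects are the ones used above):

    CREATE QUERY top_k_drugs_sketch(INT k = 5) FOR GRAPH faers SYNTAX v2 {
      SumAccum<INT> @num_cases;                  // per-drug tally
      Seqs = {DrugSequence.*};
      TopDrugs = SELECT d
                 FROM Seqs:ds -(hasDrugs:e)- Drug:d
                 ACCUM d.@num_cases += 1         // one count per mentioning sequence
                 ORDER BY d.@num_cases DESC      // rank by tally
                 LIMIT k;                        // keep only the k most reported
      PRINT TopDrugs;
    }

A typical invocation of the real query: RUN QUERY most_reported_drugs_for_company_v2("PFIZER", 5, "PS").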
diff --git a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/top_side_effects_for_top_drugs.gsql b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/top_side_effects_for_top_drugs.gsql index 43fe8f1..82efa91 100644 --- a/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/top_side_effects_for_top_drugs.gsql +++ b/Healthcare-Graph-Drug-Interaction-FAERS/db_scripts/queries/top_side_effects_for_top_drugs.gsql @@ -1,9 +1,25 @@ CREATE QUERY top_side_effects_for_top_drugs(STRING company_name="PFIZER", INT k=5, STRING role="PS") FOR GRAPH faers SYNTAX v2 { - // Possible values for role: PS, SS, I, C + + /* + Returns most mentioned drugs, the number of reported cases, and their + side effects. + + Sample inputs: + role: PS | SS | I | C // PS = primary suspect drug, SS = secondary suspect drug // C = concomitant, I = interacting + Starting with all pharma companies: + (1) Find all cases where the given pharma company is the 'mfr_sndr' + (2) For each case, attach a list of its reactions. + (3) Find all drug sequences for the selected cases, and transfer + the reaction list to the drug sequence. + (4) Count occurrences of each drug mentioned in each drug sequence. + Also count the occurrences of each reaction. + (5) Find only the Top K side effects for each selected Drug. +*/ + // Define a heap which sorts the reaction map (below) by count. TYPEDEF TUPLE <STRING name, INT cnt> tally; HeapAccum<tally>(k, cnt DESC) @top_Reactions; @@ -13,26 +29,21 @@ CREATE QUERY top_side_effects_for_top_drugs(STRING company_name="PFIZER", SumAccum<INT> @num_Cases; MapAccum<STRING, INT> @reaction_Tally; - // 1. Find all cases where the given pharma company is the 'mfr_sndr' Company = {PharmaCompany.*}; Cases = SELECT c FROM Company:s -(relatedTo:e)- ReportedCase:c WHERE s.mfr_sndr == company_name; - // 2. For each case, attach a list of its reactions. Tally = SELECT r FROM Cases:c -(hasReactions:e)- Reaction:r ACCUM c.@reaction_List += r.pt; - // 3. Find all drug sequences for the selected cases, and transfer - // the reaction list to the drug sequence. + DrugSeqs = SELECT ds FROM Cases:c -(hasSequences:e)- DrugSequence:ds WHERE (role == "" OR ds.role_cod == role) ACCUM ds.@reaction_List = c.@reaction_List; - // 4. Count occurences of each drug mentioned in each drug sequence. - // Also count the occurences of each reaction. TopDrugs = SELECT d FROM DrugSeqs:ds -(hasDrugs:e)- Drug:d ACCUM d.@num_Cases += 1, @@ -42,7 +53,6 @@ CREATE QUERY top_side_effects_for_top_drugs(STRING company_name="PFIZER", ORDER BY d.@num_Cases DESC LIMIT k; - // 5. Find only the Top K side effects for each selected Drug. TopDrugs = SELECT d FROM TopDrugs:d ACCUM @@ -53,4 +63,4 @@ CREATE QUERY top_side_effects_for_top_drugs(STRING company_name="PFIZER", PRINT TopDrugs[TopDrugs.prod_ai, TopDrugs.@num_Cases, TopDrugs.@top_Reactions]; -} \ No newline at end of file +}
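The per-drug top-k of side effects relies on a bounded HeapAccum: once the heap is at capacity, adding a smaller element is a no-op, and adding a larger one evicts the current minimum. A minimal sketch of that mechanism, usable inside any query body (values illustrative):

    TYPEDEF TUPLE <STRING name, INT cnt> tally;
    HeapAccum<tally>(3, cnt DESC) @@top3;     // capacity 3, largest cnt first
    @@top3 += tally("nausea", 120);
    @@top3 += tally("headache", 80);
    @@top3 += tally("rash", 45);
    @@top3 += tally("fatigue", 95);           // evicts ("rash", 45)
    PRINT @@top3;                             // nausea, fatigue, headache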
diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql index 72b5de7..6786cf4 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/README.gsql @@ -1,19 +1,32 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { /* -The recommendation system can predict the movie ratings based on the latent factor (model-based) method. -To train the latent factor model, run the queries below in sequence -1. splitData -2. normalization -3. initialization -4. training + To re-train the model using a different training data split, + the data needs to be reloaded before repeating the steps. + The order of the queries needs to be followed to ensure the + correctness of the result. +*/ -To test the model and use it for recommendation, run the queries below -1. test -2. recommend +STRING name = "In Database Machine Learning Recommendation"; +STRING graph_description = "Provides content and product suggestions " + + "using an in-database machine learning recommendation system. " + + "The recommendation system can predict the movie ratings based " + + "on the latent factor (model-based) method."; + +STRING query_train_order = "1. splitData, 2. normalization, 3. initialization, 4. training"; +STRING query_test_order = "1. test, 2. recommend"; -To re-train model using different training data split, the data need to be reloaded before repeat the steps above. -The order of the queries need to be followed to ensure the correctness of the result. -*/ +STRING splitData = "Splits rating data into validation set and training set " + + "with 30% data for testing."; +STRING normalization = "Normalizes the ratings by subtracting the average rating of " + + "the movie (computed from the training data) from each rating."; +STRING initialization = "Initializes the latent factor vectors for the users and the movies " + + "by a normally distributed random number generator."; +STRING training = "Trains the recommender model using the gradient descent algorithm."; +STRING test = "Outputs the real ratings provided by a user together with the predicted rating by the model."; +STRING recommend = "Outputs the top-10 movies recommended to a user"; +STRING cal_avg_rating = "Calculates the average rating across all movies."; + + +PRINT name, graph_description, query_train_order, query_test_order, splitData, normalization, initialization, training, test, recommend, cal_avg_rating; - PRINT "README works!"; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/cal_avg_rating.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/cal_avg_rating.gsql index e08196b..e88b478 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/cal_avg_rating.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/cal_avg_rating.gsql @@ -1,5 +1,14 @@ CREATE QUERY cal_avg_rating(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { - /* Write query logic here */ + /* + Returns average rating of movies + + No inputs + + Starting with all movies: + (1) Select movies with edges to users + (2) Accumulate the average rating from the edges + +*/ AvgAccum @avg_rating; Start = {MOVIE.*}; Start = SELECT s FROM Start:s -(rate:e)- USER:t @@ -8,4 +17,4 @@ CREATE QUERY cal_avg_rating(/* Parameters here */) FOR GRAPH Recommender SYNTAX POST-ACCUM s.avg_rating = s.@avg_rating; PRINT Start; -} \ No newline at end of file +}
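cal_avg_rating leans on AvgAccum, which maintains a running mean rather than a sum: every += folds one more observation into the average. A minimal sketch of just that behavior:

    AvgAccum @@avg;
    @@avg += 2;    // mean = 2.0
    @@avg += 4;    // mean = 3.0
    @@avg += 9;    // mean = 5.0
    PRINT @@avg;   // 5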
diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/initialization.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/initialization.gsql index 16e4697..a66a837 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/initialization.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/initialization.gsql @@ -1,8 +1,23 @@ CREATE QUERY initialization(INT num_latent_factors = 19) FOR GRAPH Recommender SYNTAX V2 { +/* + Initializes the latent factor vectors for the users and the movies + by a normally distributed random number generator. + + No inputs + + Starting with all MOVIE vertices: + (1) Assign a random number to the latent factor vectors of the + movies + + Using all USER vertices: + (1) Assign a random number to the latent factor vectors of the + users + +*/ // This query initializes the latent factor vectors for the users and the movies // The elements in the latent factor vectors are initialized by a normally distributed random number generator // The query inputs are the standard deviation and the mean of the normal distribution - + ListAccum<DOUBLE> @init; //The length of the latent factor vector (i.e. the number of features) is set as 19. This number has to be the same as the num_latent_factors in the training query @@ -25,4 +40,4 @@ CREATE QUERY initialization(INT num_latent_factors = 19) FOR GRAPH Recommender S s.theta = s.@init; PRINT "Initialization Completed"; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/normalization.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/normalization.gsql index e197e83..d09f8fc 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/normalization.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/normalization.gsql @@ -1,6 +1,18 @@ -CREATE QUERY normalization(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { - // This query normalizes the ratings by substracting each rating by the average rating of the movie. - // The average rating of each movie is computed from the training data +CREATE QUERY normalization(/* Parameters here */) FOR GRAPH Recommender SYNTAX V2 { +/* + Normalizes the ratings by subtracting the average rating of + the movie (computed from the training data) from each rating. + + No inputs + + Starting with all MOVIE vertices: + (1) Select movies from the training data and calculate + the average rating + (2) Subtract the average rating from each movie's + rating + +*/ + AvgAccum @avg_rating; Start = {MOVIE.*}; Start = SELECT s FROM Start:s -(rate:e)- USER:t @@ -15,4 +27,4 @@ CREATE QUERY normalization(/* Parameters here */) FOR GRAPH Recommender SYNTAX V Start = SELECT s FROM Start:s -(rate:e)- USER:t ACCUM e.rating = e.rating - s.@avg_rating; //subtract each rating by the average rating of the movie -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/recommend.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/recommend.gsql index e9bcf01..e72be8c 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/recommend.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/recommend.gsql @@ -1,4 +1,16 @@ CREATE QUERY recommend(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { +/* + Outputs the top-10 movies recommended to a user. The movies are recommended + based on the rating prediction. + + Sample inputs: + user: 1 | 2 + + Starting with all movies: + (1) Compute the rating prediction based on the model + (2) Get top 10 predicted ratings descending + +*/ //This query outputs the top-10 movies recommended to a user //The movies are recommended based on the rating prediction SumAccum<DOUBLE> @predicted_rating; @@ -12,4 +24,4 @@ CREATE QUERY recommend(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { LIMIT 10; PRINT "Recommendation (based on model)"; PRINT MOVIEs[MOVIEs.name, MOVIEs.@predicted_rating]; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/splitData.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/splitData.gsql index 8869de1..5395a76 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/splitData.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/splitData.gsql @@ -1,8 +1,17 @@ -CREATE QUERY split_data() FOR GRAPH Recommender SYNTAX V2 { - // This query split rating data into validation set and training set. - // The fraction of testing data is set to be 30%. (i.e. 30% of the rating data will be used for model validation - // and the rest 70% will be used for model training).
- // This query also output the size of total data set, the validation data set and the training data set. +CREATE QUERY split_data() FOR GRAPH Recommender SYNTAX V2 { +/* + Splits rating data into validation set and training set with 30% data + for testing. + + No inputs + + Starting from all USER vertices: + (1) Select the validation data and training data + (2) Return the size of the total data set, validation data set, and + training data set + +*/ + SumAccum<INT> @@cnt_total; SumAccum<INT> @@cnt_validation; SumAccum<INT> @@cnt_training; @@ -26,4 +35,4 @@ CREATE QUERY split_data() FOR GRAPH Recommender SYNTAX V2 { PRINT @@cnt_total,@@cnt_validation,@@cnt_training; // print out the size of total data set, the validation data set and the training data set -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql index 5cb9fcb..d9bff61 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/test.gsql @@ -1,4 +1,19 @@ CREATE QUERY test(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { + +/* + Outputs the real ratings provided by a user together with the predicted rating + by the model + + Sample inputs: + user: 1 | 2 + + Starting with a user vertex: + (1) Select movies with edges to the user + (2) Accumulate the predicted rating and the average rating of the movie + (3) Print the real ratings by the user and the predicted model + ratings + +*/ //This query outputs the real ratings provided by a user together with the predicted rating by the model //The query input is a user id //The query output is all the ratings given by the user and the ratings prediction @@ -12,4 +27,4 @@ CREATE QUERY test(VERTEX user) FOR GRAPH Recommender SYNTAX V2 { t.@real_rating += e.rating+t.avg_rating; PRINT Start[Start.@real_rating,Start.@predicted_rating]; -} \ No newline at end of file +}
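For both test and recommend, the model's prediction for user u and movie m is the dot product of the two latent-factor vectors, with the movie's average rating added back to restore what normalization subtracted:

    predicted_rating(u, m) = x_m · theta_u + avg_rating(m)

which is also why test reconstructs the "real" rating as e.rating + t.avg_rating before comparing.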
diff --git a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/training.gsql b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/training.gsql index 5096df1..7e003f2 100644 --- a/In-Database-Machine-Learning-Recommendation/db_scripts/queries/training.gsql +++ b/In-Database-Machine-Learning-Recommendation/db_scripts/queries/training.gsql @@ -1,4 +1,16 @@ CREATE QUERY training(DOUBLE learning_rate = 0.001, DOUBLE regularization_factor = 0.00005, INT Iter=100) FOR GRAPH Recommender SYNTAX V2 { + /* + Trains the recommender model using the gradient descent algorithm. + + No inputs + + Starting with all MOVIE vertices and all USER vertices: + (1) Pass x and theta to local accums + (2) Obtain the latent factor vectors using the gradient descent algorithm + (3) Output the root mean square error (RMSE) for every iteration + */ + + //This query trains the recommender model using the gradient descent algorithm //The number of features is set as 19. This number has to be the same as the num_latent_factors in the initialization query //The query inputs are the learning rate, regularization_factor and the number of training iterations @@ -88,4 +100,4 @@ CREATE QUERY training(DOUBLE learning_rate = 0.001, DOUBLE regularization_factor END, s.theta = s.@tmp; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql index f8b7d47..cfa4f88 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/README.gsql @@ -19,7 +19,7 @@ CREATE QUERY README() FOR GRAPH Entity_Resolution SYNTAX V2 { Method 2: Exact and approximate matching 1a. Run initialize_users to create a tentative User linked to each Account. 1b. Run util_set_weights to load weights used to calibrate the scoring. - 2a. Run connect_weighted_match to link Users whose matching attribute + 2a. Run connect_weighted_match to link Users whose matching attribute values score enough points. 2b. Run score_similar_attributes to add additional points for approximate matches. @@ -35,11 +35,35 @@ CREATE QUERY README() FOR GRAPH Entity_Resolution SYNTAX V2 { *** Once the Entity Resolution is complete, you can see some of the results: - get_entity_subgraph: finds the User of a given Account, the other + get_account_subgraph: finds the User of a given Account, the other Accounts of that User, and the attribute vertices of this User. recommend_videos: find all the accounts linked via entity resolution to the input account, then list videos that have the most features in common with the videos already watched by this user. */ - PRINT version; -} \ No newline at end of file + + STRING name = "In-Database-Machine-Learning-for-Big-Data-Entity-Resolution"; + STRING graph_description = "Finds Accounts that share many of the same or similar " + + "personal attributes and therefore seem to represent the same User."; + + STRING Jaccard_similarity_order = "1. initialize_users, 2. connect_jaccard_sim, 3. merge_connected_users, 4. repeat 2. and 3."; + STRING Exact_and_approximate_matching_order = "1. initialize_users, 2. util_set_weights, 3. connect_weighted_match, " + + "4. score_similar_attributes, 5. merge_similar_users, 6.
repeat 3,4,5"; + + STRING initialize_users = "Creates a user vertex for each account and connects the attributes of the account to the user."; + STRING connect_jaccard_sim = "Calculates Jaccard similarity between each vertex and every other vertex."; + STRING merge_connected_users = "Merges the Users in each group connected by SameAs edges into one lead User."; + STRING util_set_weights = "Sets all weights to calibrate the scoring."; + STRING connect_weighted_match = "Connects users that have sufficient shared attributes."; + STRING score_similar_attributes = "Considering only User-User pairs where there is already some match of attribute values, " + + "compares their names and their addresses using JaroWinkler distance."; + STRING merge_similar_users = "N/A"; + STRING get_account_subgraph = "Gets the subgraph for a given set of account ids."; + STRING recommend_videos = "Recommends videos to a User by most genres or keywords in common with the played videos."; + + + PRINT name, version, graph_description, Jaccard_similarity_order, Exact_and_approximate_matching_order; + PRINT initialize_users, connect_jaccard_sim, merge_connected_users; + PRINT util_set_weights, connect_weighted_match, score_similar_attributes, merge_similar_users; + PRINT get_account_subgraph, recommend_videos; +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_jaccard_sim.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_jaccard_sim.gsql index 330b7d6..2decb12 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_jaccard_sim.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_jaccard_sim.gsql @@ -2,9 +2,16 @@ CREATE QUERY connect_jaccard_sim (FLOAT threshold=0.5, INT topK=100, BOOL verbose=FALSE) FOR GRAPH Entity_Resolution SYNTAX V2 { /* -Calculate Jaccard similarity between each vertex and every other vertex. - Jaccard(set A, set B) = overlap_size / (size_A + size_B - overlap_size) + Calculate Jaccard similarity between each vertex and every other vertex. + Jaccard(set A, set B) = overlap_size / (size_A + size_B - overlap_size) + + No inputs + + (1) Calculate the number of eligible neighbors of each vertex + (2) Find paths from UserA->neighbor->UserB: count A&B's common neighbors + (3) Calculate Jaccard(A,B). Keep the scores > threshold */ + TYPEDEF TUPLE SimilarityTuple; MapAccum<VERTEX, INT> @@deg; // degree of each VERTEX MapAccum<VERTEX, INT> @intersection; // num neighbors in common @@ -15,13 +22,13 @@ Calculate Jaccard similarity between each vertex and every other vertex. @@etype_list += ["User_Last_Name","User_Address","User_Device"]; IF verbose THEN PRINT @@etype_list; END; - // Calculate the number of eligible neighbors of each vertex + Start = SELECT s FROM User:s ACCUM FOREACH e IN @@etype_list DO @@deg += (s -> s.outdegree(e)) END; - // Find paths from UserA->neighbor->UserB: count A&B's common neighbors + Others = SELECT B FROM User:A -()- (IP|Email|Phone|Last_Name|Address|Device):n -()- User:B @@ -29,9 +36,10 @@ Calculate Jaccard similarity between each vertex and every other vertex. ACCUM A.@intersection += (B -> 1), // tally each path A->B @@path_count += 1; + IF verbose THEN PRINT @@path_count; END; - // Calculate Jaccard(A,B).
Keep the scores > threshold + Result = SELECT A FROM User:A ACCUM FOREACH (B, overlap) IN A.@intersection DO FLOAT score = overlap*1.0/(@@deg.get(A) + @@deg.get(B) - overlap), @@ -46,4 +54,4 @@ Calculate Jaccard similarity between each vertex and every other vertex. PRINT @@jaccard_heap; PRINT to_string(@@insert_count) + " SameAs edges inserted" AS endMsg; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_weighted_match.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_weighted_match.gsql index 67cb065..321be25 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_weighted_match.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/connect_weighted_match.gsql @@ -1,20 +1,29 @@ CREATE QUERY connect_weighted_match(float threshold=0.2, bool verbose=false) FOR GRAPH Entity_Resolution SYNTAX v2{ /* - Connect users that have sufficient shared attributes. The linking score between - two users is defined as the weighted sum of their shared attributes. - Two users will be linked if the linking sc ore is above the threshold. + Connect users that have sufficient shared attributes. The linking score between + two users is defined as the weighted sum of their shared attributes. + Two users will be linked if the linking score is above the threshold. + + No inputs + + (1) Copy Weights map to a global accumulator, so it's always available + (2) For each attribute connected to users, store the weight to each + user in the score map + (3) For each pair of users connected via attributes, aggregate all attribute + weights. + (4) Connect the users with a SameAs edge if the score > threshold. + */ MapAccum<VERTEX, SumAccum<DOUBLE>> @score; MapAccum<STRING, DOUBLE> @@wt_map; SumAccum<INT> @@insert_count, @@attr_count; - // Copy Weights map to a global accumulator, so it's always available + Wt = SELECT w FROM Weights:w POST-ACCUM @@wt_map += w.wt_map; IF verbose THEN PRINT @@wt_map; END; - // For each attribute connected to users, - // store the weight to each user in the score map + Attributes = SELECT attr FROM User:usr -((User_IP|User_Email|User_Last_Name|User_Phone|User_Address|User_Device):e)- :attr ACCUM @@ -22,8 +31,6 @@ CREATE QUERY connect_weighted_match(float threshold=0.2, bool verbose=false) FOR attr.@score += (usr -> @@wt_map.get(e.type)); IF verbose THEN PRINT @@attr_count; END; - // For each pair of users connected via attributes, aggregate all attribute - // weights. Connect the users with a SameAs edge if the score > threshold.
Attrs = SELECT attr FROM Attributes:attr -((User_IP|User_Email|User_Last_Name|User_Phone|User_Address|User_Device):e)- :usr ACCUM @@ -38,4 +45,4 @@ CREATE QUERY connect_weighted_match(float threshold=0.2, bool verbose=false) FOR ; PRINT to_string(@@insert_count) + " SameAs edges inserted; " + "s2_connect_weighted_match: Done" AS endMsg; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/get_account_subgraph.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/get_account_subgraph.gsql index 3166090..67f0026 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/get_account_subgraph.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/get_account_subgraph.gsql @@ -1,6 +1,17 @@ CREATE QUERY get_account_subgraph(SET account_ids, BOOL include_attributes=FALSE) FOR GRAPH Entity_Resolution SYNTAX v2 { + /* + Get the subgraph for a given set of account ids + + Sample inputs: + account_ids: 1 | 2 | 3 + + (1) Get vertex set from input set + (2) Select users connected to the accounts set and collect + Has_Account edges + */ + ListAccum<EDGE> @@edges_to_display; INT numAccounts; INT numUsers; @@ -11,4 +22,4 @@ CREATE QUERY get_account_subgraph(SET account_ids, BOOL include_attribut PRINT "get_account_subgraph works!"; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/initialize_users.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/initialize_users.gsql index 9d0a247..7f05595 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/initialize_users.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/initialize_users.gsql @@ -1,7 +1,13 @@ CREATE QUERY initialize_users() FOR GRAPH Entity_Resolution SYNTAX v2 { -// Create a user vertex for each account and connecs the attributes -// (IP, Email, Device, Phone, Last_Name, address) of the account to the user. - +/* + Create a user vertex for each account and connect the attributes + (IP, Email, Device, Phone, Last_Name, Address) of the account to the user. + + No inputs + + (1) Initialize each account with a user + (2) Connect the User to all the attributes of their account +*/ // Initialize each account with a user Accounts = SELECT s FROM Account:s WHERE s.outdegree("Has_Account")==0 @@ -35,4 +41,4 @@ CREATE QUERY initialize_users() FOR GRAPH Entity_Resolution SYNTAX v2 { INSERT INTO User_Address VALUES(s.id, attr); // Note: Insertions will not be visible until after the query completes. PRINT "s1_initialize_users: Done" AS endMsg; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/merge_connected_users.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/merge_connected_users.gsql index 036750b..f4b2342 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/merge_connected_users.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/merge_connected_users.gsql @@ -1,11 +1,21 @@ CREATE QUERY merge_connected_users(FLOAT threshold=1.0, BOOL verbose=FALSE) FOR GRAPH Entity_Resolution SYNTAX V2 { -/*Connect users having sufficient shared attributes. The linking score between - Group users connected by SameAs edges: - 1.
Find connected users using the connected component algorithm. - 2. In each component, select a lead user. - 3. In each component, connect all attributes from other users to the lead user - 4. Delete the users that are not the lead user. - */ +/* + Merge the Users connected by SameAs edges into one vertex per connected + component, using the connected component algorithm. + + No inputs + + Starting with all Users: + (1) Initialize each user with itself as the lead of the component + (2) Assign the min vertex ID of a connected component to every other member. + (3) The rest of the query merges all the users in each connected component into one vertex. + (4) Transfer each of the Attribute vertices (IPs, Emails, Phones, Last_Names, + Addresses, Devices) to the lead user. + (5) Delete the non-lead User vertices + + */ // MinAccum selects the vertex with the minimum internal ID MinAccum<VERTEX<User>> @min_user_id; @@ -84,7 +94,7 @@ CREATE QUERY merge_connected_users(FLOAT threshold=1.0, BOOL verbose=FALSE) FOR VERTEX lead_usr = s.@min_user_id, INSERT INTO User_Device VALUES (lead_usr, t), DELETE (e) - ; + ; // 4. Delete the non-lead User vertices Not_lead = SELECT s FROM Not_lead:s POST-ACCUM @@ -92,4 +102,4 @@ CREATE QUERY merge_connected_users(FLOAT threshold=1.0, BOOL verbose=FALSE) FOR // Print whether any grouping has been performed PRINT converged; -} \ No newline at end of file +}
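Step (2) is the standard minimum-id label propagation. A minimal self-contained sketch of that sub-step (accumulator name hypothetical; the real query also keeps the lead VERTEX itself in @min_user_id):

    MinAccum<INT> @comp_id;
    Updated = {User.*};
    Updated = SELECT s FROM Updated:s
              POST-ACCUM s.@comp_id = getvid(s);   // each User starts as its own component
    WHILE (Updated.size() > 0) DO
        Updated = SELECT t FROM Updated:s -(SameAs:e)- User:t
                  WHERE s.@comp_id < t.@comp_id    // only edges that can lower t's label
                  ACCUM t.@comp_id += s.@comp_id;  // MinAccum keeps the smaller id
    END;
    // every member of a component now carries the component's minimum internal id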
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/recommend_videos.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/recommend_videos.gsql index 67d44f8..32923cf 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/recommend_videos.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/recommend_videos.gsql @@ -1,16 +1,25 @@ CREATE QUERY recommend_videos(vertex inputAcc, int k=5) FOR GRAPH Entity_Resolution SYNTAX v2 { - /* Recommend videos to User: - 1. Find all the accounts related to the input account - (according to the entity resolution). - 2. Find all the videos these accounts have played. - 3. Find all the un-watched videos which have the most genres - or keywords in common with the played videos. - ********************************************************* - * Example parameters: - * Account : 407 - * k : 5 - **********************************************************/ + +/* + Recommend videos to User: + 1. Find all the accounts related to the input account + (according to the entity resolution). + 2. Find all the videos these accounts have played. + 3. Find all the un-watched videos which have the most genres + or keywords in common with the played videos. + + Sample inputs: + Account : 407 + k : 5 + + Start from the input account inputAcc: + (1) Get all the accounts linked through the same user as the source account + (2) Get the videos played by the accounts of interest + (3) Tag each genre or keyword of a video played by this User + (4) Count genres or keywords an unwatched video has in common with tagged videos + (5) Show connections (edges) to the features of the recommended videos +*/ SetAccum<VERTEX> @@connected_accts; MapAccum<VERTEX, INT> @map; @@ -20,27 +29,24 @@ FOR GRAPH Entity_Resolution SYNTAX v2 { Source_acct = {inputAcc}; - // Get all the accounts linked through the same user as the source account + Related_accts = SELECT acct FROM Source_acct:s -(Has_Account:e1)- User:u -(Has_Account:e2)- Account:acct ACCUM @@edge_list += e1, @@edge_list += e2; - // This block is just for collecting edges to display Attributes = SELECT attr FROM Related_accts:s-((Has_IP|Has_Email|Has_Last_Name|Has_Phone|Has_Address|Has_Device):e)-:attr ACCUM @@edge_list += e; - // Get the videos played by the accounts of intereset Played_vids = SELECT t FROM Related_accts:s -(Has_Play_Event:e1)- :v -(Play_Video:e2)- :t ACCUM t.@cnt += 1, // tag each video played by this User @@edge_list += e1, @@edge_list += e2; - // Tag each genre or keyword of a video played by this User Video_features = SELECT t FROM Played_vids:s-((Has_Genre|Has_Keyword):e)-:t ACCUM t.@cnt += s.@cnt, @@edge_list += e; - // Count genres or keywords an unwatched video has in common with tagged videos + Recommended_vids = SELECT t FROM Video_features:s-((Has_Genre|Has_Keyword):e)-:t WHERE t.@cnt == 0 ACCUM t.@cnt += s.@cnt, t.@map += (s->s.@cnt) @@ -48,9 +54,8 @@ FOR GRAPH Entity_Resolution SYNTAX v2 { PRINT Recommended_vids; - // Show connections (edges) to the features of the recommended videos Recommended_vids = SELECT s FROM Recommended_vids:s-((Has_Genre|Has_Keyword):e)-:t ACCUM @@edge_list += e; PRINT @@edge_list; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/score_similar_attributes.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/score_similar_attributes.gsql index eb88156..d128550 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/score_similar_attributes.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/score_similar_attributes.gsql @@ -2,10 +2,23 @@ CREATE QUERY score_similar_attributes( bool do_last_name=TRUE, bool do_address=TRUE, bool print_only=FALSE) FOR GRAPH Entity_Resolution SYNTAX v2 { - /* Considering only User-User pairs where this is already some match - of attribute values, compare their names and their addresses using - JaroWinkler distance (score [0,1] for [nothing in common,identical]). - Use this to add a prorated weight to their existing similarity scores. + + /* + Considering only User-User pairs where there is already some match + of attribute values, compare their names and their addresses using + JaroWinkler distance (score [0,1] for [nothing in common,identical]). + Use this to add a prorated weight to their existing similarity scores.
+ + No inputs + + (1) Get weights of Last_Name and Address from the global Weight vertex + (2) Find all linked users (order doesn't matter), plus each user's last name + (3) If names aren't identical, compute JaroWinkler * weight + (4) Find all linked users (order doesn't matter), plus each user's address + (5) If addresses aren't identical, compute JaroWinkler * weight + (6) Add the new similarity scores to the existing scores + + */ TYPEDEF TUPLE String_pair; @@ -16,7 +29,7 @@ SYNTAX v2 { FLOAT name_wt = 0.0; FLOAT addr_wt = 0.0; - // Get weights of Last_Name and Address from the global Weight vertex + Wt = SELECT w FROM Weights:w POST-ACCUM IF do_last_name THEN @@ -24,14 +37,15 @@ SYNTAX v2 { IF do_address THEN addr_wt = w.wt_map.get("User_Address") END ; + + // last name Connected_users = SELECT A - // Find all linked users, plus each user's last name FROM User:A -(SameAs:e)- User:B, User:A -()- Last_Name:A_name, User:B -()- Last_Name:B_name WHERE A.id < B.id // filter so we don't count (A,B) & (B,A) ACCUM @@name_match += 1, - // If names aren't identical compute JaroWinkler * weight + IF do_last_name AND A_name.val != B_name.val THEN FLOAT sim = jaroWinklerDistance(A_name.id,B_name.id) * name_wt, @@sim_score += (A -> (B -> sim)), @@ -39,14 +53,15 @@ SYNTAX v2 { IF sim != 0 THEN @@name_update += 1 END END ; + + + // address Connected_users = SELECT A - // Find all linked users, plus each user's address FROM Connected_users:A -(SameAs:e)- User:B, User:A -()- Address:A_addr, User:B -()- Address:B_addr WHERE A.id < B.id // filter so we don't count (A,B) & (B,A) ACCUM @@addr_match += 1, - // If addresses aren't identical compute JaroWinkler * weight IF do_address AND A_addr.val != B_addr.val THEN FLOAT sim = jaroWinklerDistance(A_addr.id,B_addr.id) * addr_wt, @@sim_score += (A -> (B -> sim)), @@ -54,7 +69,6 @@ SYNTAX v2 { IF sim != 0 THEN @@addr_update += 1 END END ; - // Add the new similarity scores to the existing scores IF NOT print_only THEN Connected_users = SELECT A @@ -69,4 +83,4 @@ SYNTAX v2 { PRINT @@sim_score.size() AS num_scores; PRINT @@string_pairs; PRINT @@sim_score; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_count_vertices.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_count_vertices.gsql index ea03064..1a17f1e 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_count_vertices.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_count_vertices.gsql @@ -1,8 +1,17 @@ CREATE QUERY util_count_vertices(STRING v_type="User") FOR GRAPH Entity_Resolution SYNTAX V2 { +/* + Counts vertices of a given type + + No inputs + + Start from vertex set v_type: + (1) Count the number of vertices in v_type + +*/ SumAccum<INT> @@v_count; Source = {v_type}; H = SELECT v FROM Source:v ACCUM @@v_count += 1; PRINT @@v_count; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_delete_users.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_delete_users.gsql index f755d9e..fc99ee3 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_delete_users.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_delete_users.gsql @@ -1,4 +1,15 @@ CREATE QUERY util_delete_users(bool are_you_sure=FALSE) FOR GRAPH
Entity_Resolution SYNTAX V2 { + +/* + Deletes all users + + No inputs + + If are_you_sure is True: + (1) Selects and deletes all Users + (2) Prints action taken + +*/ IF are_you_sure THEN All_users = {User.*}; @@ -8,4 +19,4 @@ CREATE QUERY util_delete_users(bool are_you_sure=FALSE) FOR GRAPH Entity_Resolut ELSE PRINT "No action taken" AS endMsg; END; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_print_vertices.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_print_vertices.gsql index 96023a8..d26c0d4 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_print_vertices.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_print_vertices.gsql @@ -1,6 +1,15 @@ CREATE QUERY util_print_vertices (STRING v_type="Weights") FOR GRAPH Entity_Resolution SYNTAX v2 { + +/* + Get all vertices of a given type + + No inputs + + Selects all vertices of the given type and prints the set + +*/ Vertices = {v_type}; // Get all vertices of type v_type PRINT Vertices; -} \ No newline at end of file +}
diff --git a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_set_weights.gsql b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_set_weights.gsql index 6435b92..7011ba4 100644 --- a/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_set_weights.gsql +++ b/In-Database-Machine-Learning-for-Big-Data-Entity-Resolution/db_scripts/queries/util_set_weights.gsql @@ -3,6 +3,15 @@ CREATE QUERY util_set_weights( DOUBLE last_name_wt=0.75, DOUBLE address_wt=0.5, DOUBLE device_wt=0.5) FOR GRAPH Entity_Resolution SYNTAX v2{ + /* + Sets all weights used to calibrate the scoring + + Inputs: per-attribute weights (defaults provided) + + Sets weights from the parameters and stores them in the Weights vertex + + */ + MapAccum<STRING, DOUBLE> @@wt_map; @@wt_map += ("User_IP" -> ip_wt); @@wt_map += ("User_Email" -> email_wt); @@ -15,4 +24,4 @@ CREATE QUERY util_set_weights( POST-ACCUM w.wt_map = @@wt_map; PRINT "init_weight_vertex: Done" AS endMsg; -} \ No newline at end of file +}
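With the weights loaded by util_set_weights, connect_weighted_match's linking score is simply the sum, over every attribute type two Users share, of that type's weight. Worked example with the defaults above: two Users sharing only a last name (0.75) and a device (0.5) score 0.75 + 0.5 = 1.25, well above connect_weighted_match's default threshold of 0.2, so a SameAs edge is inserted; Users sharing nothing score 0 and stay unlinked.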
diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/README.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/README.gsql index 2b0b10b..459bcf1 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/README.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/README.gsql @@ -1,17 +1,27 @@ CREATE QUERY README(/* Parameters here */) FOR GRAPH LowRankApproximation SYNTAX V2 { /* -The LowRankApproximation Starter Kit factorize a loaded sparse matrix A into two low-rank matrices U and V such that the matrix product of U and the transpose of V can approximate the original sparse matrix A. The U and V are obtained by minimize the Frobenius norm (or the root mean squer error) of A - U*transpose(V) using gradient descent algorithm. -To compute U and V, run the queries below in sequence - 1. initialization - 2. factorization + The LowRankApproximation Starter Kit factorizes a loaded sparse matrix A into + two low-rank matrices U and V such that the matrix product of U and the transpose + of V can approximate the original sparse matrix A. The U and V are obtained by + minimizing the Frobenius norm (or the root mean square error) of + A - U*transpose(V) using the gradient descent algorithm. +*/ + +STRING graph_name = "Low-Rank-Approximation-Machine-Learning"; +STRING graph_description = "Implements the low-rank approximation algorithm natively " + + "in-database to deliver personalized recommendations."; +STRING query_order = "1. initialization, 2. factorization, 3. compare_approximation, 4. print_result"; -To compare the approximated matrix with the original matrix, run the queries below -compare_approximation +STRING initialization = "Initializes the row vectors for the matrix U and the matrix V " + + "where elements are initialized by a normally distributed random number generator."; +STRING factorization = "Factorizes the loaded sparse matrix into two low-rank matrices " + + "U and V using the gradient descent algorithm."; -To print out the matrices U and V, run print_result +//To compare the approximated matrix with the original matrix +STRING compare_approximation = "Outputs the element values of one row of the loaded matrix " + + "specified by the input row index together with the approximated element values."; +STRING print_result = "Prints out two matrices U and V factorized from the loaded sparse matrix."; -The order of the queries need to be followed to ensure the correctness of the result. -*/ - - PRINT "README works!"; -} \ No newline at end of file +PRINT graph_name, graph_description, query_order, initialization, factorization, compare_approximation, print_result; + +}
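In equation form, the optimization this kit performs is (presumably with the standard L2 penalty, given the factorization query's regularization_factor parameter):

    minimize over U, V:   || A - U * transpose(V) ||_F^2  +  lambda * ( ||U||_F^2 + ||V||_F^2 )

where ||.||_F is the Frobenius norm and lambda is the regularization_factor; gradient descent updates the rows of U and V, and the RMSE reported per iteration tracks the first term.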
diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/compare_approximation.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/compare_approximation.gsql index 2a5eeb3..3f16ebd 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/compare_approximation.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/compare_approximation.gsql @@ -1,4 +1,19 @@ CREATE QUERY compare_approximation(VERTEX row_index) FOR GRAPH LowRankApproximation SYNTAX V2 { +/* + Outputs the element values of one row of the loaded matrix specified by the + input row index together with the approximated element values + + Sample inputs: + row_index: + + Starting with a row index: + (1) Get columns of elements in row + (2) Accumulate the approximated values and real values: + t.@approximated_value += dotProduct_List_List(s.u,t.v), + t.@real_value += e.element_value + (3) Output all the existing element values in the given row +*/ + // This query outputs the element values of one row of the loaded matrix specified by the input row index together with the approximated element values // The query input is a row index // The query output is all the existing element values in the given row. The column index is shown as the v_id of the MATRIX_COLUMN vertex. @@ -13,4 +28,4 @@ CREATE QUERY compare_approximation(VERTEX row_index) FOR GRAPH LowRa ORDER BY str_to_int(t.column_index) ASC; PRINT MATRIX[MATRIX.@real_value,MATRIX.@approximated_value]; -} \ No newline at end of file +}
diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/factorization.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/factorization.gsql index 99a2ca3..17a39f3 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/factorization.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/factorization.gsql @@ -1,4 +1,16 @@ CREATE QUERY factorization(DOUBLE learning_rate = 0.001, DOUBLE regularization_factor = 0.00005, INT Iter=30) FOR GRAPH LowRankApproximation SYNTAX V2 { + /* + Factorizes the loaded sparse matrix into two low-rank matrices U and V using + the gradient descent algorithm. + + No inputs + + Starting with all matrix rows: + (1) Pass u and v to local accums + (2) Obtain the row vectors using the gradient descent algorithm + (3) Calculate RMSE + */ + // This query factorizes the loaded sparse matrix into two low-rank matrices U and V using the gradient descent algorithm // The length of row vectors is set as 19. This number has to be the same as the len_of_rowVector in the initialization query // The query inputs are the learning rate, regularization_factor and the number of iterations @@ -82,4 +94,4 @@ CREATE QUERY factorization(DOUBLE learning_rate = 0.001, DOUBLE regularization_f END, s.v = s.@tmp; -} \ No newline at end of file +}
diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/initialization.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/initialization.gsql index 9b0d3e0..ca97ade 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/initialization.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/initialization.gsql @@ -1,4 +1,17 @@ CREATE QUERY initialization(float sdv = 0.1, float mean = 0.1) FOR GRAPH LowRankApproximation SYNTAX V2 { + /* + Initializes the row vectors for the matrix U and the matrix V where elements + are initialized by a normally distributed random number generator. + + No inputs + + Starting with all matrix rows + (1) Assign the random number to the row vectors of the matrix U + + Starting with all matrix columns + (2) Assign the random number to the row vectors of the matrix V + */ + // This query initializes the row vectors for the matrix U and the matrix V // The elements in the row vectors are initialized by a normally distributed random number generator // The query inputs are the standard deviation and the mean of the normal distribution @@ -24,4 +37,4 @@ CREATE QUERY initialization(float sdv = 0.1, float mean = 0.1) FOR GRAPH LowRank END POST-ACCUM s.v = s.@init; -} \ No newline at end of file +}
diff --git a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/print_result.gsql b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/print_result.gsql index 29c6274..d782118 100644 --- a/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/print_result.gsql +++ b/Low-Rank-Approximation-Machine-Learning/db_scripts/queries/print_result.gsql @@ -1,4 +1,18 @@ CREATE QUERY print_result(/* Parameters here */) FOR GRAPH LowRankApproximation SYNTAX V2 { + /* + Prints out two matrices U and V factorized from the loaded sparse matrix.
+ + No inputs + + Starting with all matrix rows: + (1) Order rows by row index ascending + and print matrix U + + Using all matrix columns: + (1) Order columns by column index ascending + and print matrix V + */ + // This query print out two matrices U and V factorized from the loaded sparse matrix. The row index of the U and V are shown as the v_id of MATRIX_ROW and MATRIX_COLUMN respectively. The row vectors of each row are shown as MATRIX_U.u and MATRIX_V.v. MATRIX_U = {MATRIX_ROW.*}; MATRIX_U = SELECT s FROM MATRIX_U:s @@ -9,4 +23,4 @@ CREATE QUERY print_result(/* Parameters here */) FOR GRAPH LowRankApproximation MATRIX_V = SELECT s FROM MATRIX_V:s ORDER BY str_to_int(s.column_index) ASC; PRINT MATRIX_V [MATRIX_V.v]; -} \ No newline at end of file +} diff --git a/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql new file mode 100644 index 0000000..e8fdbad --- /dev/null +++ b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/README.gsql @@ -0,0 +1,13 @@ +CREATE QUERY README(/* Parameters here */) FOR GRAPH sdmGraph SYNTAX V2 { + +STRING name = "Machine-Learning-and-Real-time-Fraud-Detection"; +STRING graph_description = "Mobile Industry example for detecting fraud in real-time " + + "and generating graph-based features for training the machine learning solution"; + +STRING query_order = "No order"; + +STRING feature_collection = "Returns stable connections given phone id, number of calls, and duration limit."; + +PRINT name, graph_description, query_order, feature_collection; + +} diff --git a/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/feature_collection.gsql b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/feature_collection.gsql index 66206ce..5e6f174 100644 --- a/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/feature_collection.gsql +++ b/Machine-Learning-and-Real-time-Fraud-Detection/db_scripts/queries/feature_collection.gsql @@ -1,4 +1,19 @@ CREATE query feature_collection(vertex phoneId, int durationLimit = 600, int numOfCallLimit = 10) for graph sdmGraph SYNTAX V2 { + /* + Returns stable connections given phone id, number of calls, and duration limit. + + Sample inputs: + phoneId: 1 | 2 + + Starting with a seed phoneID: + (1) Select phone ids with edges to the seed. 
+ (2) Store stable connection edges and stable targets + (3) Update neighbor information among target group + (4) Count stable calls + (5) Return seed, target group, edges, and stable connection list + +*/ + TYPEDEF TUPLE CallInfo; SumAccum<INT> @stableCount; GroupByAccum<VERTEX tid, ListAccum<CallInfo> callInfo> @NB_Info; @@ -65,4 +80,4 @@ CREATE query feature_collection(vertex phoneId, int durationLimit = 600, PRINT TargetGroup; PRINT @@target_Group_Edge_List; PRINT @@in_Group_Stable_Connection_List; -} \ No newline at end of file +}
diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql new file mode 100644 index 0000000..752a602 --- /dev/null +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/README.gsql @@ -0,0 +1,14 @@ +CREATE QUERY README() FOR GRAPH Storage { + STRING graph_description = "Network and IT resource graph for modeling and analyzing " + + "the impact of a hardware outage on workloads."; + + STRING query_order = "no order"; + STRING app_impact = "Detect the top k applications which have the most impact on a given " + + "application."; + STRING storage_impact = "Detect edges that differ from the input type where their goUpper " + + "storage attribute is true."; + STRING warning_impact = "Returns Alert_App, App_Service, and Service Manager edges related " + + "to a warning vertex."; + + PRINT graph_description, query_order, app_impact, storage_impact, warning_impact; +}
diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql index 344a983..9764951 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/app_impact.gsql @@ -1,4 +1,20 @@ CREATE QUERY app_impact(vertex a, float decay, int k) FOR GRAPH Storage SYNTAX V2 { +/* + + Detect the top k applications which have the most impact on a given + application. + + Sample inputs: + a: 9998 | 9649 | 5679 + decay: 0.5 | 0.8 | 1 + k: 5 | 4 | 10 + + Starting from an input application, + (1) Find all applications connected to the input application with + an AppCall edge and calculate their impact score and edge set. + (2) Display the resulting applications ordered by impact score. + +*/ int iteration = 0; @@ -19,9 +35,9 @@ CREATE QUERY app_impact(vertex a, float decay, int k) FOR GRAPH Sto result = result UNION start; end; - Result = SELECT s from result:s + final = SELECT s from result:s ORDER BY s.@impact_score DESC LIMIT k; - print Result; -} \ No newline at end of file + print final; +}
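The decay parameter presumably damps an application's contribution geometrically with hop distance — with decay = 0.5 a direct AppCall neighbor would contribute half as much as the input, a two-hop neighbor a quarter, and decay = 1 would weigh all hops equally — though the exact accumulation sits in the elided hunk. A typical invocation, using the sample inputs above:

    RUN QUERY app_impact("9998", 0.5, 5)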
diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql index 48b11ac..498ad8d 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/storage_impact.gsql @@ -1,4 +1,21 @@ CREATE QUERY storage_impact(string vertexType, vertex input) FOR GRAPH Storage SYNTAX V2 { + +/* + Detect edges that differ from the input type where their goUpper + storage attribute is true. + + Sample inputs: + vertexType: Application + input: + + Starting from an input vertex, + (1) Find all the vertices with an edge to the input where the + goUpper attribute for the edge is true + (2) Add the edges to a global accumulator + (3) Return the edge list + +*/ + OrAccum @@stop; @@ -16,4 +33,4 @@ CREATE QUERY storage_impact(string vertexType, vertex input) FOR GRAPH Storage S end; PRINT @@edgeList; -} \ No newline at end of file +}
diff --git a/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql b/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql index 9280975..7e3663b 100644 --- a/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql +++ b/Network-and-IT-Resource-Optimization/db_scripts/queries/warning_impact.gsql @@ -1,4 +1,19 @@ CREATE QUERY warning_impact(vertex inputWarn) FOR GRAPH Storage SYNTAX V2 { +/* + Returns Alert_App, App_Service, and Service Manager edges related + to a warning vertex. + + Sample inputs: + inputWarn: 2000821 + + Starting from an "inputWarn", + (1) Select all applications connected to the warning and add to + edge list + (2) Select all services related to the applications from part (1) + (3) Select all managers related to the services from part (2) + +*/ + ListAccum<EDGE> @@edge_List; @@ -15,4 +30,4 @@ CREATE QUERY warning_impact(vertex inputWarn) FOR GRAPH Storage SYNTAX PRINT Man; PRINT @@edge_List; -} \ No newline at end of file +}
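As with the other starter kits, these queries are installed and invoked from the GSQL shell, for example:

    USE GRAPH Storage
    INSTALL QUERY warning_impact
    RUN QUERY warning_impact("2000821")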