Merge pull request #13 from jjmccollum/12-prepare-for-open-cbgm-v2

12 prepare for open cbgm v2
jjmccollum · Feb 3, 2025 · 6680ed8 · 6680ed8
2 parents 0f86324 + 0bba94f
commit 6680ed8
Show file tree

Hide file tree

Showing 11 changed files with 254 additions and 99 deletions.
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -2,11 +2,7 @@
 # See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-single-platform.yml
 name: testing
 
-on:
-  push:
-    branches: [ "master" ]
-  pull_request:
-    branches: [ "master" ]
+on: [push, pull_request]
 
 jobs:
   build:

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.4)
 # Set the project name and version:
 project(open-cbgm 
-	VERSION 1.7.0
+	VERSION 2.0.0
 	DESCRIPTION "Fast, compact, open-source, TEI-compliant C++ implementation of the Coherence-Based Genealogical Method"
 	LANGUAGES C CXX)
 

diff --git a/README.md b/README.md
@@ -1,13 +1,16 @@
 # open-cbgm
-Fast, compact, open-source, TEI-compliant C++ implementation of the Coherence-Based Genealogical Method
 
-[![Version 1.7.0](https://img.shields.io/badge/version-1.7.0-blue)](https://github.com/jjmccollum/open-cbgm)
+![open-cbgm logo](https://github.com/jjmccollum/open-cbgm/blob/master/img/open-cbgm-logo.png)
+
+[![Version 2.0.0](https://img.shields.io/badge/version-2.0.0-blue)](https://github.com/jjmccollum/open-cbgm)
 [![Build Status](https://github.com/jjmccollum/open-cbgm/actions/workflows/testing.yml/badge.svg)](https://github.com/jjmccollum/open-cbgm/actions/workflows/testing.yml)
 [![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](https://choosealicense.com/licenses/mit/)
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4048498.svg)](https://doi.org/10.5281/zenodo.4048498)
 
 ## About This Project
 
+Fast, compact, open-source, TEI-compliant C++ implementation of the Coherence-Based Genealogical Method
+
 ### Introduction
 
 The Coherence-Based Genealogical Method (CBGM) is a novel approach to textual criticism, popularized by the Institut für Neutestamentliche Textforschung (INTF) for its use in the production of the _Editio Critica Maior_ (_ECM_) of the New Testament. It is a meta-method, combining methodology-dependent philological decisions from the user with efficient computer-based calculations to highlight genealogical relationships between different stages of the text. To establish genealogical relationships in the presence of contamination (understood to be a problem in the textual tradition of the New Testament), the CBGM makes a number of philosophical and methodological innovations, such as the abstracting of texts away from the manuscripts that preserve them (and the resulting rejection of hypothetical ancestors as used in traditional stemmata), the encoding of the textual critic's decisions in local stemmata of variants, and the use of coherence in textual flow to evaluate hypotheses about the priority of variant readings. 

diff --git a/img/open-cbgm-logo.png b/img/open-cbgm-logo.png
diff --git a/img/open-cbgm-logo.svg b/img/open-cbgm-logo.svg
diff --git a/include/set_cover_solver.h b/include/set_cover_solver.h
@@ -62,12 +62,12 @@ class set_cover_solver {
 	roaring::Roaring get_unique_rows() const;
 	bool is_feasible(const roaring::Roaring & solution_rows) const;
 	void remove_redundant_rows_from_solution(roaring::Roaring & initial_solution_rows) const;
-	set_cover_solution get_trivial_solution() const;
-	set_cover_solution get_greedy_solution() const;
+	roaring::Roaring get_greedy_solution() const;
 	void branch(const roaring::Roaring & remaining, std::stack<branch_and_bound_node> & nodes);
 	float bound(const roaring::Roaring & solution_rows) const;
 	void branch_and_bound(std::list<set_cover_solution> & solutions);
-	void solve(std::list<set_cover_solution> & solutions);
+	void branch_and_bound_single_solution(std::list<set_cover_solution> & solutions);
+	void solve(std::list<set_cover_solution> & solutions, bool single_solution=false);
 };
 
 #endif /* SET_COVER_SOLVER_H */
diff --git a/include/witness.h b/include/witness.h
@@ -47,7 +47,7 @@ class witness {
 	std::unordered_map<std::string, genealogical_comparison> get_genealogical_comparisons() const;
 	genealogical_comparison get_genealogical_comparison_for_witness(const std::string & other_id) const;
 	std::list<std::string> get_potential_ancestor_ids() const;
-	std::list<set_cover_solution> get_substemmata(float ub=0) const;
+	std::list<set_cover_solution> get_substemmata(float ub=0, bool single_solution=false) const;
 	void set_stemmatic_ancestor_ids(const std::list<std::string> & witnesses);
 	std::list<std::string> get_stemmatic_ancestor_ids() const;
 };

diff --git a/src/set_cover_solver.cpp b/src/set_cover_solver.cpp
@@ -185,31 +185,9 @@ void set_cover_solver::remove_redundant_rows_from_solution(Roaring & solution_ro
 }
 
 /**
- * Returns a trivial set cover solution consisting of the lowest-cost row that covers the target columns.
- * If the current witness has the Ausgangstext as a potential ancestor (which should hold for all non-fragmentary witnesses)
- * and the Ausgangstext explains all other readings
- * (i.e., if all local stemmata are connected, which is necessary for the global stemma to be connected),
- * then at least one such solution is guaranteed to exist.
+ * Returns the bitmap representing the set cover solution found by the basic greedy heuristic.
  */
-set_cover_solution set_cover_solver::get_trivial_solution() const {
-	set_cover_solution trivial_solution;
-	trivial_solution.rows = list<set_cover_row>();
-	trivial_solution.agreements = 0;
-	trivial_solution.cost = numeric_limits<float>::infinity();
-	for (set_cover_row row : rows) {
-		if (target.isSubset(row.explained) && row.cost < trivial_solution.cost) {
-			trivial_solution.rows = list<set_cover_row>({row});
-			trivial_solution.agreements = (int) row.agreements.cardinality();
-			trivial_solution.cost = row.cost;
-		}
-	}
-	return trivial_solution;
-}
-
-/**
- * Returns the set cover solution found by the basic greedy heuristic.
- */
-set_cover_solution set_cover_solver::get_greedy_solution() const {
+Roaring set_cover_solver::get_greedy_solution() const {
 	Roaring greedy_solution_rows = Roaring();
 	Roaring uncovered = Roaring(target);
 	//Until the target is completely covered, choose the row with the lowest cost-to-coverage proportion:
@@ -239,8 +217,7 @@ set_cover_solution set_cover_solver::get_greedy_solution() const {
 	}
 	//Now remove any redundant columns from this solution:
 	remove_redundant_rows_from_solution(greedy_solution_rows);
-	set_cover_solution greedy_solution = get_solution_from_rows(greedy_solution_rows);
-	return greedy_solution;
+	return greedy_solution_rows;
 }
 
 /**
@@ -290,13 +267,12 @@ void set_cover_solver::branch_and_bound(list<set_cover_solution> & solutions) {
 	remaining.addRange(0, rows.size());
 	//Initialize a stack of branch-and-bound nodes:
 	stack<branch_and_bound_node> nodes = stack<branch_and_bound_node>();
-	//If no fixed upper bound is specified, then obtain a good initial upper bound quickly using the trivial solution and the greedy solution:
+	//If no fixed upper bound is specified, then obtain a good initial upper bound quickly using the greedy solution:
 	float ub = fixed_ub;
 	bool is_ub_fixed = fixed_ub < numeric_limits<float>::infinity();
 	if (!is_ub_fixed) {
-		set_cover_solution trivial_solution = get_trivial_solution();
-		set_cover_solution greedy_solution = get_greedy_solution();
-		ub = min(trivial_solution.cost, greedy_solution.cost);
+		Roaring greedy_solution_rows = get_greedy_solution();
+		ub = bound(greedy_solution_rows);
 	}
 	//Initialize the stack of branch and bound nodes with the first node:
 	branch(remaining, nodes);
@@ -369,12 +345,93 @@ void set_cover_solver::branch_and_bound(list<set_cover_solution> & solutions) {
 	return;
 }
 
+/**
+ * Populates a list of set cover solutions via branch and bound, under the assumption that only a single lowest-cost solution is needed.
+ * Any fixed upper bound for the solver will be ignored.
+ * This is an optimization intended to be used for global stemma construction, where only one solution is used even if there are multiple of equal cost.
+ *
+ * The greedy solution seeds both the upper bound and the candidate solution set.
+ * A feasible candidate replaces the stored solution only if its cost is strictly lower than the current upper bound,
+ * so the candidate map holds one row set at a time and equal-cost alternatives are discarded.
+ * The resulting solution is appended to the given list; the list is not cleared by this method.
+ *
+ * NOTE(review): inside the loop below, branch() is not called again after the initial call,
+ * and the upper bound is only compared against the cost of feasible candidates (it is never used to prune a node).
+ * How much of the search space is actually explored therefore depends on what the initial branch() call pushes onto the stack,
+ * which is not visible from here -- confirm that this is intended.
+ */
+void set_cover_solver::branch_and_bound_single_solution(list<set_cover_solution> & solutions) {
+	//Initialize a map of solution row set bitmaps, keyed by their serializations:
+	unordered_map<string, Roaring> distinct_row_sets = unordered_map<string, Roaring>();
+	//Initialize bitmaps representing rows included in the current solution and rows to be processed:
+	Roaring accepted = Roaring();
+	Roaring remaining = Roaring();
+	remaining.addRange(0, rows.size());
+	//Initialize a stack of branch-and-bound nodes:
+	stack<branch_and_bound_node> nodes = stack<branch_and_bound_node>();
+	//Obtain a good initial upper bound quickly using the greedy solution (the infinite placeholder is overwritten immediately):
+	float ub = numeric_limits<float>::infinity();
+	Roaring greedy_solution_rows = get_greedy_solution();
+	ub = bound(greedy_solution_rows);
+	//Add the greedy solution row bitmap to the solution set, so that it is returned if nothing strictly cheaper is found:
+	string serialized = greedy_solution_rows.toString();
+	distinct_row_sets[serialized] = greedy_solution_rows;
+	//Initialize the stack of branch and bound nodes with the first node:
+	branch(remaining, nodes);
+	//Then continue with branch and bound until there is nothing left to be processed:
+	while (!nodes.empty()) {
+		//Get the current node from the stack (by reference, so that its state can be updated in place):
+		branch_and_bound_node & node = nodes.top();
+		//Adjust the set partitions to reflect the candidate solution represented by the current node:
+		unsigned int row = node.row;
+		if (node.state == node_state::ACCEPT) {
+			//Add the candidate row to the solution:
+			remaining.remove(row);
+			accepted.add(row);
+			//Update its state, so that the next visit to this node tries rejecting the row:
+			node.state = node_state::REJECT;
+		}
+		else if (node.state == node_state::REJECT) {
+			//Exclude the candidate row from the solution:
+			accepted.remove(row);
+			//Update its state, so that the next visit to this node retires it:
+			node.state = node_state::DONE;
+		}
+		else {
+			//We're done processing this node, and we can add its row back to the set of available rows:
+			remaining.add(row);
+			nodes.pop();
+			continue;
+		}
+		//Check if current set of accepted rows represents a feasible solution:
+		if (is_feasible(accepted)) {
+			//If it does, then work on a copy of the accepted rows, leaving the search state untouched:
+			Roaring solution_rows = Roaring(accepted);
+			//Remove redundant rows before calculating the cost of the solution:
+			remove_redundant_rows_from_solution(solution_rows);
+			float cost = bound(solution_rows);
+			//Check if this cost is strictly below the current upper bound (ties keep the solution already stored):
+			if (cost < ub) {
+				//If it is, then update the upper bound and discard the previously stored solution:
+				ub = cost;
+				distinct_row_sets = unordered_map<string, Roaring>();
+				//Then add the solution row bitmap to the solution set:
+				string serialized = solution_rows.toString();
+				distinct_row_sets[serialized] = solution_rows;
+			}
+		}
+	}
+	//For each distinct set of solution rows, add a set cover solution data structure to the solutions list:
+	for (pair<string, Roaring> kv : distinct_row_sets) {
+		Roaring solution_rows = kv.second;
+		set_cover_solution solution = get_solution_from_rows(solution_rows);
+		solutions.push_back(solution);
+	}
+	return;
+}
+
 /**
  * Populates the given solution list with solutions to the set cover problem.
  * If the set cover solver was constructed with a fixed upper bound, then this method will enumerate all solutions with costs within that bound.
+ * If the flag for single solutions is set (which should happen for the construction of the global stemma), 
+ * then the fixed upper bound is ignored, and a slightly more optimized version of the branch and bound procedure is used.
  */
-void set_cover_solver::solve(list<set_cover_solution> & solutions) {
+void set_cover_solver::solve(list<set_cover_solution> & solutions, bool single_solution) {
 	solutions = list<set_cover_solution>();
+	//If the single solution flag is set, then set the fixed upper bound to infinity:
+	if (single_solution) {
+		fixed_ub = std::numeric_limits<float>::infinity();
+	}
 	//Create a map of row IDs to their indices:
 	unordered_map<string, unsigned int> row_ids_to_inds = unordered_map<string, unsigned int>();
 	unsigned int row_ind = 0;
@@ -432,7 +489,11 @@ void set_cover_solver::solve(list<set_cover_solution> & solutions) {
 	}
 	list<set_cover_solution> subproblem_solutions = list<set_cover_solution>();
 	set_cover_solver subproblem_solver = fixed_ub != numeric_limits<float>::infinity() ? set_cover_solver(subproblem_rows, subproblem_target, subproblem_ub) : set_cover_solver(subproblem_rows, subproblem_target);
-	subproblem_solver.branch_and_bound(subproblem_solutions);
+	if (single_solution) {
+		subproblem_solver.branch_and_bound_single_solution(subproblem_solutions);
+	} else {
+		subproblem_solver.branch_and_bound(subproblem_solutions);
+	}
 	//Then add the unique coverage rows found earlier to the subproblem solutions:
 	set_cover_solution unique_rows_solution = get_solution_from_rows(unique_rows);
 	for (set_cover_solution subproblem_solution : subproblem_solutions) {

diff --git a/src/witness.cpp b/src/witness.cpp
@@ -216,8 +216,10 @@ list<string> witness::get_potential_ancestor_ids() const {
  * Returns a list of all minimum-cost substemmata for this witness.
  * Optionally, an upper bound on substemma cost can be specified,
  * in which case all substemmata within that cost bound will be returned.
+ * A boolean flag indicating whether a single solution is desired can also be specified,
+ * in which case the cost bound will be ignored and an optimized version of the branch-and-bound procedure will be used.
  */
- list<set_cover_solution> witness::get_substemmata(float ub) const {
+ list<set_cover_solution> witness::get_substemmata(float ub, bool single_solution) const {
 	list<set_cover_solution> substemmata = list<set_cover_solution>();
 	//Populate a vector of set cover rows using genealogical comparisons with this witness's potential ancestors:
 	vector<set_cover_row> rows = vector<set_cover_row>();
@@ -237,8 +239,8 @@ list<string> witness::get_potential_ancestor_ids() const {
 	//Initialize the bitmap of the target set to be covered:
 	Roaring target = genealogical_comparisons.at(id).extant;
 	//Then populate the rows of this table using the solver:
-	set_cover_solver solver = ub > 0 ? set_cover_solver(rows, target, ub) : set_cover_solver(rows, target);
-	solver.solve(substemmata);
+	set_cover_solver solver = (ub > 0 && !single_solution) ? set_cover_solver(rows, target, ub) : set_cover_solver(rows, target);
+	solver.solve(substemmata, single_solution);
 	return substemmata;
  }
 

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -20,14 +20,14 @@ add_test(NAME apparatus_constructor COMMAND autotest -t apparatus_constructor)
 add_test(NAME apparatus_get_extant_passages_for_witness COMMAND autotest -t apparatus_get_extant_passages_for_witness)
 add_test(NAME set_cover_solver_constructor COMMAND autotest -t set_cover_solver_constructor)
 add_test(NAME set_cover_solver_get_unique_rows COMMAND autotest -t set_cover_solver_get_unique_rows)
-add_test(NAME set_cover_solver_get_trivial_solution COMMAND autotest -t set_cover_solver_get_trivial_solution)
 add_test(NAME set_cover_solver_get_greedy_solution COMMAND autotest -t set_cover_solver_get_greedy_solution)
 add_test(NAME witness_constructor_1 COMMAND autotest -t witness_constructor_1)
 add_test(NAME witness_constructor_2 COMMAND autotest -t witness_constructor_2)
 add_test(NAME witness_get_genealogical_comparison_for_witness_1 COMMAND autotest -t witness_get_genealogical_comparison_for_witness_1)
 add_test(NAME witness_get_genealogical_comparison_for_witness_2 COMMAND autotest -t witness_get_genealogical_comparison_for_witness_2)
 add_test(NAME witness_get_genealogical_comparison_for_witness_3 COMMAND autotest -t witness_get_genealogical_comparison_for_witness_3)
 add_test(NAME witness_get_substemmata COMMAND autotest -t witness_get_substemmata)
+add_test(NAME witness_get_substemmata_single_solution COMMAND autotest -t witness_get_substemmata_single_solution)
 add_test(NAME textual_flow_constructor_1 COMMAND autotest -t textual_flow_constructor_1)
 add_test(NAME textual_flow_constructor_2 COMMAND autotest -t textual_flow_constructor_2)
 add_test(NAME textual_flow_textual_flow_to_dot COMMAND autotest -t textual_flow_textual_flow_to_dot)