Skip to content

Commit

Permalink
Merge pull request #13 from jjmccollum/12-prepare-for-open-cbgm-v2
Browse files Browse the repository at this point in the history
12 prepare for open cbgm v2
  • Loading branch information
jjmccollum authored Feb 3, 2025
2 parents 0f86324 + 0bba94f commit 6680ed8
Show file tree
Hide file tree
Showing 11 changed files with 254 additions and 99 deletions.
6 changes: 1 addition & 5 deletions .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,7 @@
# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-single-platform.yml
name: testing

on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]
on: [push, pull_request]

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.4)
# Set the project name and version:
project(open-cbgm
VERSION 1.7.0
VERSION 2.0.0
DESCRIPTION "Fast, compact, open-source, TEI-compliant C++ implementation of the Coherence-Based Genealogical Method"
LANGUAGES C CXX)

Expand Down
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# open-cbgm
Fast, compact, open-source, TEI-compliant C++ implementation of the Coherence-Based Genealogical Method

[![Version 1.7.0](https://img.shields.io/badge/version-1.7.0-blue)](https://github.com/jjmccollum/open-cbgm)
![open-cbgm logo](https://github.com/jjmccollum/open-cbgm/blob/master/img/open-cbgm-logo.png)

[![Version 2.0.0](https://img.shields.io/badge/version-2.0.0-blue)](https://github.com/jjmccollum/open-cbgm)
[![Build Status](https://github.com/jjmccollum/open-cbgm/actions/workflows/testing.yml/badge.svg)](https://github.com/jjmccollum/open-cbgm/actions/workflows/testing.yml)
[![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](https://choosealicense.com/licenses/mit/)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4048498.svg)](https://doi.org/10.5281/zenodo.4048498)

## About This Project

Fast, compact, open-source, TEI-compliant C++ implementation of the Coherence-Based Genealogical Method

### Introduction

The Coherence-Based Genealogical Method (CBGM) is a novel approach to textual criticism, popularized by the Institut für Neutestamentliche Textforschung (INTF) for its use in the production of the _Editio Critica Maior_ (_ECM_) of the New Testament. It is a meta-method, combining methodology-dependent philological decisions from the user with efficient computer-based calculations to highlight genealogical relationships between different stages of the text. To establish genealogical relationships in the presence of contamination (understood to be a problem in the textual tradition of the New Testament), the CBGM makes a number of philosophical and methodological innovations, such as the abstracting of texts away from the manuscripts that preserve them (and the resulting rejection of hypothetical ancestors as used in traditional stemmata), the encoding of the textual critic's decisions in local stemmata of variants, and the use of coherence in textual flow to evaluate hypotheses about the priority of variant readings.
Expand Down
Binary file added img/open-cbgm-logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
85 changes: 85 additions & 0 deletions img/open-cbgm-logo.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions include/set_cover_solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,12 @@ class set_cover_solver {
roaring::Roaring get_unique_rows() const;
bool is_feasible(const roaring::Roaring & solution_rows) const;
void remove_redundant_rows_from_solution(roaring::Roaring & initial_solution_rows) const;
set_cover_solution get_trivial_solution() const;
set_cover_solution get_greedy_solution() const;
roaring::Roaring get_greedy_solution() const;
void branch(const roaring::Roaring & remaining, std::stack<branch_and_bound_node> & nodes);
float bound(const roaring::Roaring & solution_rows) const;
void branch_and_bound(std::list<set_cover_solution> & solutions);
void solve(std::list<set_cover_solution> & solutions);
void branch_and_bound_single_solution(std::list<set_cover_solution> & solutions);
void solve(std::list<set_cover_solution> & solutions, bool single_solution=false);
};

#endif /* SET_COVER_SOLVER_H */
2 changes: 1 addition & 1 deletion include/witness.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class witness {
std::unordered_map<std::string, genealogical_comparison> get_genealogical_comparisons() const;
genealogical_comparison get_genealogical_comparison_for_witness(const std::string & other_id) const;
std::list<std::string> get_potential_ancestor_ids() const;
std::list<set_cover_solution> get_substemmata(float ub=0) const;
std::list<set_cover_solution> get_substemmata(float ub=0, bool single_solution=false) const;
void set_stemmatic_ancestor_ids(const std::list<std::string> & witnesses);
std::list<std::string> get_stemmatic_ancestor_ids() const;
};
Expand Down
125 changes: 93 additions & 32 deletions src/set_cover_solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,31 +185,9 @@ void set_cover_solver::remove_redundant_rows_from_solution(Roaring & solution_ro
}

/**
* Returns a trivial set cover solution consisting of the lowest-cost row that covers the target columns.
* If the current witness has the Ausgangstext as a potential ancestor (which should hold for all non-fragmentary witnesses)
* and the Ausgangstext explains all other readings
* (i.e., if all local stemmata are connected, which is necessary for the global stemma to be connected),
* then at least one such solution is guaranteed to exist.
* Returns the bitmap representing the set cover solution found by the basic greedy heuristic.
*/
set_cover_solution set_cover_solver::get_trivial_solution() const {
	//Start from an empty placeholder with infinite cost; this is what is returned if no single row covers the target:
	set_cover_solution trivial_solution;
	trivial_solution.rows = list<set_cover_row>();
	trivial_solution.agreements = 0;
	trivial_solution.cost = numeric_limits<float>::infinity();
	//Inspect each row by const reference so that rows are not copied just to be examined:
	for (const set_cover_row & row : rows) {
		//A row qualifies only if it explains every target column on its own; keep the cheapest such row
		//(on ties, the first qualifying row encountered is retained because the comparison is strict):
		if (target.isSubset(row.explained) && row.cost < trivial_solution.cost) {
			trivial_solution.rows = list<set_cover_row>({row});
			trivial_solution.agreements = static_cast<int>(row.agreements.cardinality());
			trivial_solution.cost = row.cost;
		}
	}
	return trivial_solution;
}

/**
* Returns the set cover solution found by the basic greedy heuristic.
*/
set_cover_solution set_cover_solver::get_greedy_solution() const {
Roaring set_cover_solver::get_greedy_solution() const {
Roaring greedy_solution_rows = Roaring();
Roaring uncovered = Roaring(target);
//Until the target is completely covered, choose the row with the lowest cost-to-coverage proportion:
Expand Down Expand Up @@ -239,8 +217,7 @@ set_cover_solution set_cover_solver::get_greedy_solution() const {
}
//Now remove any redundant rows from this solution:
remove_redundant_rows_from_solution(greedy_solution_rows);
set_cover_solution greedy_solution = get_solution_from_rows(greedy_solution_rows);
return greedy_solution;
return greedy_solution_rows;
}

/**
Expand Down Expand Up @@ -290,13 +267,12 @@ void set_cover_solver::branch_and_bound(list<set_cover_solution> & solutions) {
remaining.addRange(0, rows.size());
//Initialize a stack of branch-and-bound nodes:
stack<branch_and_bound_node> nodes = stack<branch_and_bound_node>();
//If no fixed upper bound is specified, then obtain a good initial upper bound quickly using the trivial solution and the greedy solution:
//If no fixed upper bound is specified, then obtain a good initial upper bound quickly using the greedy solution:
float ub = fixed_ub;
bool is_ub_fixed = fixed_ub < numeric_limits<float>::infinity();
if (!is_ub_fixed) {
set_cover_solution trivial_solution = get_trivial_solution();
set_cover_solution greedy_solution = get_greedy_solution();
ub = min(trivial_solution.cost, greedy_solution.cost);
Roaring greedy_solution_rows = get_greedy_solution();
ub = bound(greedy_solution_rows);
}
//Initialize the stack of branch and bound nodes with the first node:
branch(remaining, nodes);
Expand Down Expand Up @@ -369,12 +345,93 @@ void set_cover_solver::branch_and_bound(list<set_cover_solution> & solutions) {
return;
}

/**
 * Populates the given list with a single lowest-cost set cover solution found via branch and bound.
 * Any fixed upper bound for the solver is ignored; the cost of the greedy solution is used as the initial upper bound instead,
 * and the greedy solution itself is returned unless the search finds a strictly cheaper feasible solution.
 * Exactly one solution is appended to the list, even if several solutions share the lowest cost.
 * This is an optimization intended to be used for global stemma construction, where only one solution is used even if there are multiple of equal cost.
 */
void set_cover_solver::branch_and_bound_single_solution(list<set_cover_solution> & solutions) {
	//Seed the incumbent (best solution found so far) with the greedy solution;
	//its cost serves as the initial upper bound:
	Roaring best_solution_rows = get_greedy_solution();
	float ub = bound(best_solution_rows);
	//Initialize bitmaps representing rows included in the current solution and rows to be processed:
	Roaring accepted = Roaring();
	Roaring remaining = Roaring();
	remaining.addRange(0, rows.size());
	//Initialize the stack of branch-and-bound nodes with the first node:
	stack<branch_and_bound_node> nodes = stack<branch_and_bound_node>();
	branch(remaining, nodes);
	//Then continue with branch and bound until there is nothing left to be processed:
	while (!nodes.empty()) {
		//Get the current node from the stack:
		branch_and_bound_node & node = nodes.top();
		//Adjust the set partitions to reflect the candidate solution represented by the current node:
		unsigned int row = node.row;
		if (node.state == node_state::ACCEPT) {
			//Add the candidate row to the solution:
			remaining.remove(row);
			accepted.add(row);
			//Update its state:
			node.state = node_state::REJECT;
		}
		else if (node.state == node_state::REJECT) {
			//Exclude the candidate row from the solution:
			accepted.remove(row);
			//Update its state:
			node.state = node_state::DONE;
		}
		else {
			//We're done processing this node, and we can add its row back to the set of available rows:
			remaining.add(row);
			nodes.pop();
			continue;
		}
		//Check if the current set of accepted rows represents a feasible solution:
		if (is_feasible(accepted)) {
			//If it does, then remove redundant rows from a copy and calculate the cost of what is left:
			Roaring solution_rows = Roaring(accepted);
			remove_redundant_rows_from_solution(solution_rows);
			float cost = bound(solution_rows);
			//Only a strictly cheaper solution replaces the incumbent, since a single solution is all that is needed:
			if (cost < ub) {
				ub = cost;
				best_solution_rows = solution_rows;
			}
		}
		else if (!remaining.isEmpty() && bound(accepted) < ub) {
			//Otherwise, the candidate is still incomplete, so expand the search with the next available row.
			//Without this step, no node beyond the initial one is ever explored.
			//The subtree is pruned unless the cost of the accepted rows is strictly below the upper bound,
			//because only a strict improvement on the incumbent is of interest.
			//NOTE(review): this pruning assumes bound() is the summed cost of the given rows and that row costs are non-negative - confirm.
			branch(remaining, nodes);
		}
	}
	//Convert the best set of rows found into a set cover solution data structure and add it to the solutions list:
	solutions.push_back(get_solution_from_rows(best_solution_rows));
	return;
}

/**
* Populates the given solution list with solutions to the set cover problem.
* If the set cover solver was constructed with a fixed upper bound, then this method will enumerate all solutions with costs within that bound.
* If the flag for single solutions is set (which should happen for the construction of the global stemma),
* then the fixed upper bound is ignored, and a slightly more optimized version of the branch and bound procedure is used.
*/
void set_cover_solver::solve(list<set_cover_solution> & solutions) {
void set_cover_solver::solve(list<set_cover_solution> & solutions, bool single_solution) {
solutions = list<set_cover_solution>();
//If the single solution flag is set, then set the fixed upper bound to infinity:
if (single_solution) {
fixed_ub = std::numeric_limits<float>::infinity();
}
//Create a map of row IDs to their indices:
unordered_map<string, unsigned int> row_ids_to_inds = unordered_map<string, unsigned int>();
unsigned int row_ind = 0;
Expand Down Expand Up @@ -432,7 +489,11 @@ void set_cover_solver::solve(list<set_cover_solution> & solutions) {
}
list<set_cover_solution> subproblem_solutions = list<set_cover_solution>();
set_cover_solver subproblem_solver = fixed_ub != numeric_limits<float>::infinity() ? set_cover_solver(subproblem_rows, subproblem_target, subproblem_ub) : set_cover_solver(subproblem_rows, subproblem_target);
subproblem_solver.branch_and_bound(subproblem_solutions);
if (single_solution) {
subproblem_solver.branch_and_bound_single_solution(subproblem_solutions);
} else {
subproblem_solver.branch_and_bound(subproblem_solutions);
}
//Then add the unique coverage rows found earlier to the subproblem solutions:
set_cover_solution unique_rows_solution = get_solution_from_rows(unique_rows);
for (set_cover_solution subproblem_solution : subproblem_solutions) {
Expand Down
8 changes: 5 additions & 3 deletions src/witness.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,10 @@ list<string> witness::get_potential_ancestor_ids() const {
* Returns a list of all minimum-cost substemmata for this witness.
* Optionally, an upper bound on substemma cost can be specified,
* in which case all substemmata within that cost bound will be returned.
* A boolean flag indicating whether a single solution is desired can also be specified,
* in which case the cost bound will be ignored and an optimized version of the branch-and-bound procedure will be used.
*/
list<set_cover_solution> witness::get_substemmata(float ub) const {
list<set_cover_solution> witness::get_substemmata(float ub, bool single_solution) const {
list<set_cover_solution> substemmata = list<set_cover_solution>();
//Populate a vector of set cover rows using genealogical comparisons with this witness's potential ancestors:
vector<set_cover_row> rows = vector<set_cover_row>();
Expand All @@ -237,8 +239,8 @@ list<string> witness::get_potential_ancestor_ids() const {
//Initialize the bitmap of the target set to be covered:
Roaring target = genealogical_comparisons.at(id).extant;
//Then populate the rows of this table using the solver:
set_cover_solver solver = ub > 0 ? set_cover_solver(rows, target, ub) : set_cover_solver(rows, target);
solver.solve(substemmata);
set_cover_solver solver = (ub > 0 && !single_solution) ? set_cover_solver(rows, target, ub) : set_cover_solver(rows, target);
solver.solve(substemmata, single_solution);
return substemmata;
}

Expand Down
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ add_test(NAME apparatus_constructor COMMAND autotest -t apparatus_constructor)
add_test(NAME apparatus_get_extant_passages_for_witness COMMAND autotest -t apparatus_get_extant_passages_for_witness)
add_test(NAME set_cover_solver_constructor COMMAND autotest -t set_cover_solver_constructor)
add_test(NAME set_cover_solver_get_unique_rows COMMAND autotest -t set_cover_solver_get_unique_rows)
add_test(NAME set_cover_solver_get_trivial_solution COMMAND autotest -t set_cover_solver_get_trivial_solution)
add_test(NAME set_cover_solver_get_greedy_solution COMMAND autotest -t set_cover_solver_get_greedy_solution)
add_test(NAME witness_constructor_1 COMMAND autotest -t witness_constructor_1)
add_test(NAME witness_constructor_2 COMMAND autotest -t witness_constructor_2)
add_test(NAME witness_get_genealogical_comparison_for_witness_1 COMMAND autotest -t witness_get_genealogical_comparison_for_witness_1)
add_test(NAME witness_get_genealogical_comparison_for_witness_2 COMMAND autotest -t witness_get_genealogical_comparison_for_witness_2)
add_test(NAME witness_get_genealogical_comparison_for_witness_3 COMMAND autotest -t witness_get_genealogical_comparison_for_witness_3)
add_test(NAME witness_get_substemmata COMMAND autotest -t witness_get_substemmata)
add_test(NAME witness_get_substemmata_single_solution COMMAND autotest -t witness_get_substemmata_single_solution)
add_test(NAME textual_flow_constructor_1 COMMAND autotest -t textual_flow_constructor_1)
add_test(NAME textual_flow_constructor_2 COMMAND autotest -t textual_flow_constructor_2)
add_test(NAME textual_flow_textual_flow_to_dot COMMAND autotest -t textual_flow_textual_flow_to_dot)
Expand Down
Loading

0 comments on commit 6680ed8

Please sign in to comment.