diff --git a/.vscode/settings.json b/.vscode/settings.json index 1549fc7..d3f60f8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,8 +1,10 @@ { "cSpell.words": [ + "alphargs", "bilevel", "bmatrix", "diagflat", + "dtype", "Gorjanc", "Gregor", "Gregor's", @@ -15,6 +17,7 @@ "mathbb", "mathcal", "mubar", + "operatorname", "Pocrnić", "printoptions", "proxqp", @@ -22,5 +25,6 @@ "sqrtm", "vtype", "WNRM" - ] + ], + "python.analysis.typeCheckingMode": "basic" } diff --git a/Example/04/A04.txt b/Example/04/A04.txt new file mode 100644 index 0000000..3012329 --- /dev/null +++ b/Example/04/A04.txt @@ -0,0 +1,4 @@ +1 1 1 +2 2 1 +3 3 1 +4 4 1 diff --git a/Example/04/EBV04.txt b/Example/04/EBV04.txt new file mode 100644 index 0000000..be589c9 --- /dev/null +++ b/Example/04/EBV04.txt @@ -0,0 +1,4 @@ +1 +1 +2 +2 diff --git a/Example/04/S04.txt b/Example/04/S04.txt new file mode 100644 index 0000000..ae2331b --- /dev/null +++ b/Example/04/S04.txt @@ -0,0 +1,4 @@ +1 1 0.11111111111111 +2 2 0.11111111111111 +3 3 4.0 +4 4 4.0 \ No newline at end of file diff --git a/Example/A50.txt b/Example/50/A50.txt similarity index 100% rename from Example/A50.txt rename to Example/50/A50.txt diff --git a/Example/EBV50.txt b/Example/50/EBV50.txt similarity index 100% rename from Example/EBV50.txt rename to Example/50/EBV50.txt diff --git a/Example/S50.txt b/Example/50/S50.txt similarity index 100% rename from Example/S50.txt rename to Example/50/S50.txt diff --git a/Example/50/example.py b/Example/50/example.py new file mode 100644 index 0000000..6d649b3 --- /dev/null +++ b/Example/50/example.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import sys; sys.path.insert(1, '../../') # HACK pre-module workaround +import alphargs + +# key problem variables loaded from standard format txt files +sigma, mubar, omega, n = alphargs.load_problem( + "A50.txt", + "EBV50.txt", + "S50.txt" +) + +# NOTE this trick of handling sex data is specific to the initial simulation +# data which is structured so that candidates alternate between sires (which +# have even indices) and dams (which have odd indices). +sires = range(0, n, 2) +dams = range(1, n, 2) + +lam = 0.5 +kap = 1 + +# computes the standard and robust genetic selection solutions +w_std, obj_std = alphargs.gurobi_standard_genetics( + sigma, mubar, sires, dams, lam, n) +w_rbs, z_rbs, obj_rbs = alphargs.gurobi_robust_genetics( + sigma, mubar, omega, sires, dams, lam, kap, n) + +alphargs.print_compare_solutions( + w_std, w_rbs, obj_std, obj_rbs, z2=z_rbs, name1="w_std", name2="w_rbs") + + +print("\nDone!") diff --git a/Example/README.md b/Example/README.md index b15b17e..8e04305 100644 --- a/Example/README.md +++ b/Example/README.md @@ -1,4 +1,9 @@ -# File Description +# Examples + +- [04/](04/) A scaled up version of the small $n = 2$ example created by Julian, replicating the same problem across sires and dams. +- [50/](50/) The 50 youngest individuals from a generated cohort of 12,000. Implications of this non-random example are unknown. + +## File Description In the original simulation, we had 12k individuals and 1k samples of their estimated breeding values (EBV) – our selection criterion. @@ -8,8 +13,6 @@ Based on that simulated data: - I constructed the S-matrix (12k x 12k), which is the covariance matrix between 1k EBV samples for 12k individuals. [S50] - The EBV vector (12k), which is posterior mean over 1k samples of EBV. [EBV50] -Since 12k might be a lot, I have extracted last 50 rows/columns from above 12k set, for your test. This corresponds to the youngest 50 individuals (I don’t know the implications of choosing this non-random sample ). - The matrices are saved in row, column, value format like we talked about before. _Note: this is an unedited version of Gregor's description of the files._ diff --git a/README.md b/README.md index 2b65d39..a5b3e0a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ -# Robust Genetics Optimization +# AlphaRGS -This repository is split four parts. The main Jupyter notebook which walks through how standard and robust optimization work for genetic selection from start to finish, part of which is [Example/](Example/), a realistic simulated example from Gregor Gorjanc and Ivan Pocrnić. We also have [Scratch/](Scratch/), a mix of Jupyter notebooks used as part of working out the problem with some additional data files and example scripts. Finally we have [Writing/](Writing/), a more formal writeup of the notebook and its findings in LaTeX (written by Julian, currently housed in Overleaf). +Tools for solving robust genetics selection problems in Python. It depends on Python 3.10+, [NumPy](https://pypi.org/project/numpy/), and [Gurobi](https://www.gurobi.com/) via [gurobipy](https://pypi.org/project/gurobipy/). + +## Structure + +This repository is split four parts. As well as the module in [`alphargs/`](alphargs/), there's [`Example/`](Example/) which includes a realistic simulated example from Gregor Gorjanc and Ivan Pocrnić. We also have [Scratch/](Scratch/) which includes a notebook which walks through how standard and robust optimization work for genetic selection from start to finish. Finally we have [Writing/](Writing/), a more formal writeup of the notebook and its findings in LaTeX (written by Julian, currently housed in Overleaf). diff --git a/Example2/A02.txt b/Scratch/Example/A02.txt similarity index 100% rename from Example2/A02.txt rename to Scratch/Example/A02.txt diff --git a/Example2/EBV02.txt b/Scratch/Example/EBV02.txt similarity index 100% rename from Example2/EBV02.txt rename to Scratch/Example/EBV02.txt diff --git a/Example2/README.md b/Scratch/Example/README.md similarity index 59% rename from Example2/README.md rename to Scratch/Example/README.md index 1a3bd3e..9324efb 100644 --- a/Example2/README.md +++ b/Scratch/Example/README.md @@ -1,3 +1,3 @@ # Example 2 -This is a small example for $n = 2$ created by Julian. It uses the same file formats as [Example 1](../Example/). +This is a small example for $n = 2$ created by Julian. It uses the same file formats as [Example 1](../../Example/). diff --git a/Example2/S02.txt b/Scratch/Example/S02.txt similarity index 100% rename from Example2/S02.txt rename to Scratch/Example/S02.txt diff --git a/Example2/n2-test.ipynb b/Scratch/Example/n2-test.ipynb similarity index 100% rename from Example2/n2-test.ipynb rename to Scratch/Example/n2-test.ipynb diff --git a/robust-genetics.ipynb b/Scratch/robust-genetics.ipynb similarity index 100% rename from robust-genetics.ipynb rename to Scratch/robust-genetics.ipynb diff --git a/alphargs/__init__.py b/alphargs/__init__.py new file mode 100644 index 0000000..496422d --- /dev/null +++ b/alphargs/__init__.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +""" +AlphaRGS provides functions for loading and solving robust genetic selection +problems. It uses file structures from the AlphaGenes family, Numpy for its +internal data structures, and depends on Gurobi for its solvers. +""" + +from .loaders import * +from .solvers import * +from .utils import * diff --git a/alphargs/loaders.py b/alphargs/loaders.py new file mode 100644 index 0000000..b0db25f --- /dev/null +++ b/alphargs/loaders.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +"""Loading in Genetics Data + +Genetics data could be presented to our solvers in multiple formats, these +functions define the appropriate methods for loading those in correctly. +""" + +import numpy as np # defines matrix structures +import numpy.typing as npt # variable typing definitions for NumPy + + +def load_ped(filename: str) -> dict: # TODO add dict typing + """ + Function for reading *.ped files to a dictionary. Takes the file name + as a string input and returns the pedigree structure as a dictionary. + """ + with open(filename, "r") as file: + # first line of *.ped lists the headers; skip + file.readline() + # create a list of int lists from each line (dropping optional labels) + data = [[int(x) for x in line.split(",")[0:3]] for line in file] + # convert this list of lists into a dictionary + ped = {entry[0]: entry[1:3] for entry in data} + + return ped + + +def makeA(pedigree: dict) -> npt.NDArray[np.float64]: # TODO add dict typing + """ + Construct Wright's Numerator Relationship Matrix from a given pedigree + structure. Takes the pedigree as a dictionary input and returns the + matrix as output. + """ + m = len(pedigree) + # preallocate memory for A + A = np.zeros((m, m), dtype=float) + + # iterate over rows + for i in range(0, m): + # save parent indexes: pedigrees indexed from 1, Python from 0 + p = pedigree[i+1][0]-1 + q = pedigree[i+1][1]-1 + # iterate over columns sub-diagonal + for j in range(0, i): + # calculate sub-diagonal entries + A[i, j] = 0.5*(A[j, p] + A[j, q]) + # populate sup-diagonal (symmetric) + A[j, i] = A[i, j] + # calculate diagonal entries + A[i, i] = 1 + 0.5*A[p, q] + + return A + + +def load_problem(A_filename: str, E_filename: str, S_filename: str, + dimension: int | None = None, pedigree: bool = False + ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], + npt.NDArray[np.float64], int]: + """ + Used to load genetic selection problems into NumPy. It takes three + string inputs for filenames where Sigma, Mu, and Omega are stored, + as well as an optional integer input for problem dimension if this + is known. If it's know know, it's worked out based on E_filename. + + As output, it returns (A, E, S, n), where A and S are n-by-n NumPy + arrays, E is a length n NumPy array, and n is an integer. + """ + + def load_symmetric_matrix(filename: str, dimension: int + ) -> npt.NDArray[np.float64]: + """ + Since NumPy doesn't have a stock way to load matrices + stored in coordinate format format, this adds one. + """ + + matrix = np.zeros([dimension, dimension], dtype=float) + + with open(filename, 'r') as file: + for line in file: + i, j, entry = line.split(" ") + # data files indexed from 1, not 0 + matrix[int(i)-1, int(j)-1] = entry + matrix[int(j)-1, int(i)-1] = entry + + return matrix + + E = np.loadtxt(E_filename, dtype=float) + # if dimension not specified, use `E` which doesn't need preallocation + if not dimension: + assert isinstance(E.size, int) # catches E being empty + dimension = E.size + + # S is stored by coordinates so need special loader + S = load_symmetric_matrix(S_filename, dimension) + # A can be stored as a pedigree or by coordinates + if pedigree: + A = makeA(load_ped(A_filename)) + else: + A = load_symmetric_matrix(A_filename, dimension) + + return A, E, S, dimension diff --git a/alphargs/solvers.py b/alphargs/solvers.py new file mode 100644 index 0000000..f0dacc8 --- /dev/null +++ b/alphargs/solvers.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- +"""Defining Solvers + +With the problems properly loaded into Numpy, this section contains functions +for solving those under various formulations and methods. +""" + +import numpy as np # defines matrix structures +import numpy.typing as npt # variable typing definitions for NumPy +import gurobipy as gp # Gurobi optimization interface (1) +from gurobipy import GRB # Gurobi optimization interface (2) + + +def gurobi_standard_genetics( + sigma: np.ndarray, + mu: np.ndarray, + sires, # type could be np.ndarray, sets[ints], lists[int], range, etc + dams, # type could be np.ndarray, sets[ints], lists[int], range, etc + lam: float, + dimension: int, + upper_bound: npt.NDArray[np.float64] | float = 1.0, + lower_bound: npt.NDArray[np.float64] | float = 0.0, + time_limit: float | None = None, + max_duality_gap: float | None = None, + debug: bool = False +): # -> tuple[npt.NDArray[np.float64], float]: # BUG Gurobi typing broken + """ + Takes a sigma, mu, sire list, dam list, and dimension as inputs and solves + the standard genetic selection problem with Gurobi for a given value of + lambda (lam), returning the portfolio and objective value as outputs. The + default lower and upper bounds of 0 and 1 can be changed also. + + Optional arguments time_limit and max_duality_gap respectively control how + long Gurobi will spend on the problem and the maximum allowable duality + gap. Optional argument debug sets whether Gurobi prints output to terminal. + """ + + # create models for standard and robust genetic selection + model = gp.Model("standard-genetics") + + # Gurobi spews all its output into the terminal by default, this restricts + # that behaviour to only happen when the `debug` flag is used. + if not debug: model.setParam('OutputFlag', 0) + + # integrating bounds within variable definitions is more efficient than + # as a separate constraint, which Gurobi would convert to bounds anyway + w = model.addMVar(shape=dimension, lb=lower_bound, ub=upper_bound, + vtype=GRB.CONTINUOUS, name="w") + + model.setObjective( + # NOTE Gurobi introduces error if we use `np.inner(w, sigma@w)` here + w.transpose()@mu - (lam/2)*w.transpose()@(sigma@w), + GRB.MAXIMIZE + ) + + # set up the two sum-to-half constraints + M = np.zeros((2, dimension), dtype=int) + # define the M so that column i is [1;0] if i is a sire and [0;1] otherwise + M[0, sires] = 1 + M[1, dams] = 1 + # define the right hand side of the constraint Mx = m + m = np.full(2, 0.5) + model.addConstr(M@w == m, name="sum-to-half") + + # optional controls to stop Gurobi taking too long + if time_limit: model.setParam(GRB.Param.TimeLimit, time_limit) + if max_duality_gap: model.setParam('MIPGap', max_duality_gap) + + # model file can be used externally for verification + if debug: model.write("standard-opt.mps") + + model.optimize() + return w.X, model.ObjVal + + +def gurobi_robust_genetics( + sigma: npt.NDArray[np.float64], + mubar: npt.NDArray[np.float64], + omega: npt.NDArray[np.float64], + sires, # type could be np.ndarray, sets[ints], lists[int], range, etc + dams, # type could be np.ndarray, sets[ints], lists[int], range, etc + lam: float, + kappa: float, + dimension: int, + upper_bound: npt.NDArray[np.float64] | float = 1.0, + lower_bound: npt.NDArray[np.float64] | float = 0.0, + time_limit: float | None = None, + max_duality_gap: float | None = None, + debug: bool = False +): # -> tuple[npt.NDArray, float, float]: # BUG Gurobi typing broken + """ + Takes a sigma, mu-bar, omega, sire list, dam list, and dimension as inputs + and solves the robust genetic selection problem using Gurobi for given + values of lambda (lam) and kappa. It returns the portfolio and objective + value as outputs. The default lower and upper bounds of 0 and 1 can be + changed also. + + Optional arguments time_limit and max_duality_gap respectively control how + long Gurobi will spend on the problem and the maximum allowable duality + gap. Optional argument debug sets whether Gurobi prints output to terminal. + """ + + # create models for standard and robust genetic selection + model = gp.Model("robust-genetics") + + # Gurobi spews all its output into the terminal by default, this restricts + # that behaviour to only happen when the `debug` flag is used. + if not debug: model.setParam('OutputFlag', 0) + + # integrating bounds within variable definitions is more efficient than + # as a separate constraint, which Gurobi would convert to bounds anyway + w = model.addMVar(shape=dimension, lb=lower_bound, ub=upper_bound, + vtype=GRB.CONTINUOUS, name="w") + z = model.addVar(lb=0.0, name="z") + + model.setObjective( + # NOTE Gurobi introduces error if we use `np.inner(w, sigma@w)` here + w.transpose()@mubar - (lam/2)*w.transpose()@(sigma@w) - kappa*z, + GRB.MAXIMIZE + ) + + # set up the two sum-to-half constraints + M = np.zeros((2, dimension), dtype=int) + # define the M so that column i is [1;0] if i is a sire and [0;1] otherwise + M[0, sires] = 1 + M[1, dams] = 1 + # define the right hand side of the constraint Mx = m + m = np.full(2, 0.5, dtype=float) + model.addConstr(M@w == m, name="sum-to-half") + + # conic constraint which comes from robust optimization + model.addConstr(z**2 >= w.transpose()@omega@w, name="uncertainty") + + # optional controls to stop Gurobi taking too long + if time_limit: model.setParam(GRB.Param.TimeLimit, time_limit) + if max_duality_gap: model.setParam('MIPGap', max_duality_gap) + + # model file can be used externally for verification + if debug: model.write("robust-opt.mps") + + model.optimize() + return w.X, z.X, model.ObjVal diff --git a/alphargs/utils.py b/alphargs/utils.py new file mode 100644 index 0000000..d0a6cfe --- /dev/null +++ b/alphargs/utils.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +"""Utilities + +This file contains additional utilities, such as for printing and comparing +portfolios produced by the solvers. +""" + +import numpy as np + + +def print_compare_solutions( + portfolio1, # : npt.NDArray[np.float64], # BUG Gurobi typing broken + portfolio2, # : npt.NDArray[np.float64], # BUG Gurobi typing broken + objective1: float, + objective2: float, + precision: int = 5, + z1: float | None = None, + z2: float | None = None, + name1: str = "First", + name2: str = "Second" +) -> None: + """ + Takes two solutions (comprised of at least a portfolio and objective value, + plus an optional z-value and/or solution name) as inputs, and prints a + comparison of the two solutions to the terminal. The number of decimals + values are displayed to defaults to 5, but can be changed through the + precision argument. + """ + + dimension = portfolio1.size + order = len(str(dimension)) + + # HACK header breaks if precision < 3 or len(problem1) != 5 + print(f"i{' '*(order-1)} {name1} {' '*(precision-3)}{name2}") + for candidate in range(dimension): + print( + f"{candidate+1:0{order}d} " + f"{portfolio1[candidate]:.{precision}f} " + f"{portfolio2[candidate]:.{precision}f}" + ) + + def obj_string(name: str, value: float, precision: int, + z: float | None = None) -> str: + """Helper function which handles the optional z1 and z2 values""" + obj_str = f"{name}: {value:.{precision}f}" + return f"{obj_str} (z = {z:.{precision}f})" if z else f"{obj_str}" + + portfolio_abs_diff = np.abs(portfolio1-portfolio2) + print( + f"\n{obj_string(f'{name1} objective', objective1, precision, z1)}" + f"\n{obj_string(f'{name2} objective', objective2, precision, z2)}" + f"\nMaximum change: {max(portfolio_abs_diff):.{precision}f}" + f"\nAverage change: {np.mean(portfolio_abs_diff):.{precision}f}" + f"\nMinimum change: {min(portfolio_abs_diff):.{precision}f}" + ) diff --git a/example.py b/example.py new file mode 100644 index 0000000..80c7f49 --- /dev/null +++ b/example.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +import alphargs + +# key problem variables loaded from standard format txt files +sigma, mubar, omega, n = alphargs.load_problem( + "Example/04/A04.txt", + "Example/04/EBV04.txt", + "Example/04/S04.txt" +) + +# NOTE this trick of handling sex data is specific to the initial simulation +# data which is structured so that candidates alternate between sires (which +# have even indices) and dams (which have odd indices). +sires = range(0, n, 2) +dams = range(1, n, 2) + +lam = 0.5 +kap = 1 + +# computes the standard and robust genetic selection solutions +w_std, obj_std = alphargs.gurobi_standard_genetics( + sigma, mubar, sires, dams, lam, n) +w_rbs, z_rbs, obj_rbs = alphargs.gurobi_robust_genetics( + sigma, mubar, omega, sires, dams, lam, kap, n) + +alphargs.print_compare_solutions( + w_std, w_rbs, obj_std, obj_rbs, z2=z_rbs, name1="w_std", name2="w_rbs") + + +print("\nDone!")