Merge pull request #6 from Foggalong/python-rewrite

Python Rewrite
Foggalong · Apr 30, 2024 · a757434 · a757434
2 parents a8732c5 + 4e7ba82
commit a757434
Show file tree

Hide file tree

Showing 21 changed files with 401 additions and 7 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,8 +1,10 @@
 {
     "cSpell.words": [
+        "alphargs",
         "bilevel",
         "bmatrix",
         "diagflat",
+        "dtype",
         "Gorjanc",
         "Gregor",
         "Gregor's",
@@ -15,12 +17,14 @@
         "mathbb",
         "mathcal",
         "mubar",
+        "operatorname",
         "Pocrnić",
         "printoptions",
         "proxqp",
         "qpsolvers",
         "sqrtm",
         "vtype",
         "WNRM"
-    ]
+    ],
+    "python.analysis.typeCheckingMode": "basic"
 }
diff --git a/Example/04/A04.txt b/Example/04/A04.txt
@@ -0,0 +1,4 @@
+1 1 1
+2 2 1
+3 3 1
+4 4 1
diff --git a/Example/04/EBV04.txt b/Example/04/EBV04.txt
@@ -0,0 +1,4 @@
+1
+1
+2
+2
diff --git a/Example/04/S04.txt b/Example/04/S04.txt
@@ -0,0 +1,4 @@
+1 1 0.11111111111111
+2 2 0.11111111111111
+3 3 4.0
+4 4 4.0
diff --git a/Example/A50.txt → Example/50/A50.txt b/Example/A50.txt → Example/50/A50.txt
diff --git a/Example/EBV50.txt → Example/50/EBV50.txt b/Example/EBV50.txt → Example/50/EBV50.txt
diff --git a/Example/S50.txt → Example/50/S50.txt b/Example/S50.txt → Example/50/S50.txt
diff --git a/Example/50/example.py b/Example/50/example.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+
+import sys; sys.path.insert(1, '../../')  # HACK pre-module workaround
+import alphargs
+
+# key problem variables loaded from standard format txt files
+sigma, mubar, omega, n = alphargs.load_problem(
+    "A50.txt",
+    "EBV50.txt",
+    "S50.txt"
+)
+
+# NOTE this trick of handling sex data is specific to the initial simulation
+# data which is structured so that candidates alternate between sires (which
+# have even indices) and dams (which have odd indices).
+sires = range(0, n, 2)
+dams = range(1, n, 2)
+
+lam = 0.5
+kap = 1
+
+# computes the standard and robust genetic selection solutions
+w_std, obj_std = alphargs.gurobi_standard_genetics(
+    sigma, mubar, sires, dams, lam, n)
+w_rbs, z_rbs, obj_rbs = alphargs.gurobi_robust_genetics(
+    sigma, mubar, omega, sires, dams, lam, kap, n)
+
+alphargs.print_compare_solutions(
+    w_std, w_rbs, obj_std, obj_rbs, z2=z_rbs, name1="w_std", name2="w_rbs")
+
+
+print("\nDone!")
diff --git a/Example/README.md b/Example/README.md
@@ -1,4 +1,9 @@
-# File Description
+# Examples
+
+- [04/](04/) A scaled up version of the small $n = 2$ example created by Julian, replicating the same problem across sires and dams.
+- [50/](50/) The 50 youngest individuals from a generated cohort of 12,000. Implications of this non-random example are unknown.
+
+## File Description
 
 In the original simulation, we had 12k individuals and 1k samples of their estimated breeding values (EBV) – our selection criterion.
 
@@ -8,8 +13,6 @@ Based on that simulated data:
 - I constructed the S-matrix (12k x 12k), which is the covariance matrix between 1k EBV samples for 12k individuals. [S50]
 - The EBV vector (12k), which is posterior mean over 1k samples of EBV. [EBV50]
 
-Since 12k might be a lot, I have extracted last 50 rows/columns from above 12k set, for your test. This corresponds to the youngest 50 individuals (I don’t know the implications of choosing this non-random sample ).
-
 The matrices are saved in row, column, value format like we talked about before.
 
 _Note: this is an unedited version of Gregor's description of the files._
diff --git a/README.md b/README.md
@@ -1,3 +1,7 @@
-# Robust Genetics Optimization
+# AlphaRGS
 
-This repository is split four parts. The main Jupyter notebook which walks through how standard and robust optimization work for genetic selection from start to finish, part of which is [Example/](Example/), a realistic simulated example from Gregor Gorjanc and Ivan Pocrnić. We also have [Scratch/](Scratch/), a mix of Jupyter notebooks used as part of working out the problem with some additional data files and example scripts. Finally we have [Writing/](Writing/), a more formal writeup of the notebook and its findings in LaTeX (written by Julian, currently housed in Overleaf).
+Tools for solving robust genetics selection problems in Python. It depends on Python 3.10+, [NumPy](https://pypi.org/project/numpy/), and [Gurobi](https://www.gurobi.com/) via [gurobipy](https://pypi.org/project/gurobipy/).  
+
+## Structure
+
+This repository is split four parts. As well as the module in [`alphargs/`](alphargs/), there's [`Example/`](Example/) which includes a realistic simulated example from Gregor Gorjanc and Ivan Pocrnić. We also have [Scratch/](Scratch/) which includes a notebook which walks through how standard and robust optimization work for genetic selection from start to finish. Finally we have [Writing/](Writing/), a more formal writeup of the notebook and its findings in LaTeX (written by Julian, currently housed in Overleaf).
diff --git a/Example2/A02.txt → Scratch/Example/A02.txt b/Example2/A02.txt → Scratch/Example/A02.txt
diff --git a/Example2/EBV02.txt → Scratch/Example/EBV02.txt b/Example2/EBV02.txt → Scratch/Example/EBV02.txt
diff --git a/Example2/README.md → Scratch/Example/README.md b/Example2/README.md → Scratch/Example/README.md
@@ -1,3 +1,3 @@
 # Example 2
 
-This is a small example for $n = 2$ created by Julian. It uses the same file formats as [Example 1](../Example/).
+This is a small example for $n = 2$ created by Julian. It uses the same file formats as [Example 1](../../Example/).
diff --git a/Example2/S02.txt → Scratch/Example/S02.txt b/Example2/S02.txt → Scratch/Example/S02.txt
diff --git a/Example2/n2-test.ipynb → Scratch/Example/n2-test.ipynb b/Example2/n2-test.ipynb → Scratch/Example/n2-test.ipynb
diff --git a/robust-genetics.ipynb → Scratch/robust-genetics.ipynb b/robust-genetics.ipynb → Scratch/robust-genetics.ipynb
diff --git a/alphargs/__init__.py b/alphargs/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+"""
+AlphaRGS provides functions for loading and solving robust genetic selection
+problems. It uses file structures from the AlphaGenes family, Numpy for its
+internal data structures, and depends on Gurobi for its solvers.
+"""
+
+from .loaders import *
+from .solvers import *
+from .utils import *
diff --git a/alphargs/loaders.py b/alphargs/loaders.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+"""Loading in Genetics Data
+
+Genetics data could be presented to our solvers in multiple formats, these
+functions define the appropriate methods for loading those in correctly.
+"""
+
+import numpy as np          # defines matrix structures
+import numpy.typing as npt  # variable typing definitions for NumPy
+
+
+def load_ped(filename: str) -> dict:  # TODO add dict typing
+    """
+    Function for reading *.ped files to a dictionary. Takes the file name
+    as a string input and returns the pedigree structure as a dictionary.
+    """
+    with open(filename, "r") as file:
+        # first line of *.ped lists the headers; skip
+        file.readline()
+        # create a list of int lists from each line (dropping optional labels)
+        data = [[int(x) for x in line.split(",")[0:3]] for line in file]
+    # convert this list of lists into a dictionary
+    ped = {entry[0]: entry[1:3] for entry in data}
+
+    return ped
+
+
+def makeA(pedigree: dict) -> npt.NDArray[np.float64]:  # TODO add dict typing
+    """
+    Construct Wright's Numerator Relationship Matrix from a given pedigree
+    structure. Takes the pedigree as a dictionary input and returns the
+    matrix as output.
+    """
+    m = len(pedigree)
+    # preallocate memory for A
+    A = np.zeros((m, m), dtype=float)
+
+    # iterate over rows
+    for i in range(0, m):
+        # save parent indexes: pedigrees indexed from 1, Python from 0
+        p = pedigree[i+1][0]-1
+        q = pedigree[i+1][1]-1
+        # iterate over columns sub-diagonal
+        for j in range(0, i):
+            # calculate sub-diagonal entries
+            A[i, j] = 0.5*(A[j, p] + A[j, q])
+            # populate sup-diagonal (symmetric)
+            A[j, i] = A[i, j]
+        # calculate diagonal entries
+        A[i, i] = 1 + 0.5*A[p, q]
+
+    return A
+
+
+def load_problem(A_filename: str, E_filename: str, S_filename: str,
+                 dimension: int | None = None, pedigree: bool = False
+                 ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64],
+                            npt.NDArray[np.float64], int]:
+    """
+    Used to load genetic selection problems into NumPy. It takes three
+    string inputs for filenames where Sigma, Mu, and Omega are stored,
+    as well as an optional integer input for problem dimension if this
+    is known. If it's know know, it's worked out based on E_filename.
+
+    As output, it returns (A, E, S, n), where A and S are n-by-n NumPy
+    arrays, E is a length n NumPy array, and n is an integer.
+    """
+
+    def load_symmetric_matrix(filename: str, dimension: int
+                              ) -> npt.NDArray[np.float64]:
+        """
+        Since NumPy doesn't have a stock way to load matrices
+        stored in coordinate format format, this adds one.
+        """
+
+        matrix = np.zeros([dimension, dimension], dtype=float)
+
+        with open(filename, 'r') as file:
+            for line in file:
+                i, j, entry = line.split(" ")
+                # data files indexed from 1, not 0
+                matrix[int(i)-1, int(j)-1] = entry
+                matrix[int(j)-1, int(i)-1] = entry
+
+        return matrix
+
+    E = np.loadtxt(E_filename, dtype=float)
+    # if dimension not specified, use `E` which doesn't need preallocation
+    if not dimension:
+        assert isinstance(E.size, int)  # catches E being empty
+        dimension = E.size
+
+    # S is stored by coordinates so need special loader
+    S = load_symmetric_matrix(S_filename, dimension)
+    # A can be stored as a pedigree or by coordinates
+    if pedigree:
+        A = makeA(load_ped(A_filename))
+    else:
+        A = load_symmetric_matrix(A_filename, dimension)
+
+    return A, E, S, dimension
diff --git a/alphargs/solvers.py b/alphargs/solvers.py
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+"""Defining Solvers
+
+With the problems properly loaded into Numpy, this section contains functions
+for solving those under various formulations and methods.
+"""
+
+import numpy as np          # defines matrix structures
+import numpy.typing as npt  # variable typing definitions for NumPy
+import gurobipy as gp       # Gurobi optimization interface (1)
+from gurobipy import GRB    # Gurobi optimization interface (2)
+
+
+def gurobi_standard_genetics(
+    sigma: np.ndarray,
+    mu: np.ndarray,
+    sires,  # type could be np.ndarray, sets[ints], lists[int], range, etc
+    dams,   # type could be np.ndarray, sets[ints], lists[int], range, etc
+    lam: float,
+    dimension: int,
+    upper_bound: npt.NDArray[np.float64] | float = 1.0,
+    lower_bound: npt.NDArray[np.float64] | float = 0.0,
+    time_limit: float | None = None,
+    max_duality_gap: float | None = None,
+    debug: bool = False
+):  # -> tuple[npt.NDArray[np.float64], float]:  # BUG Gurobi typing broken
+    """
+    Takes a sigma, mu, sire list, dam list, and dimension as inputs and solves
+    the standard genetic selection problem with Gurobi for a given value of
+    lambda (lam), returning the portfolio and objective value as outputs. The
+    default lower and upper bounds of 0 and 1 can be changed also.
+
+    Optional arguments time_limit and max_duality_gap respectively control how
+    long Gurobi will spend on the problem and the maximum allowable duality
+    gap. Optional argument debug sets whether Gurobi prints output to terminal.
+    """
+
+    # create models for standard and robust genetic selection
+    model = gp.Model("standard-genetics")
+
+    # Gurobi spews all its output into the terminal by default, this restricts
+    # that behaviour to only happen when the `debug` flag is used.
+    if not debug: model.setParam('OutputFlag', 0)
+
+    # integrating bounds within variable definitions is more efficient than
+    # as a separate constraint, which Gurobi would convert to bounds anyway
+    w = model.addMVar(shape=dimension, lb=lower_bound, ub=upper_bound,
+                      vtype=GRB.CONTINUOUS, name="w")
+
+    model.setObjective(
+        # NOTE Gurobi introduces error if we use `np.inner(w, sigma@w)` here
+        w.transpose()@mu - (lam/2)*w.transpose()@(sigma@w),
+        GRB.MAXIMIZE
+    )
+
+    # set up the two sum-to-half constraints
+    M = np.zeros((2, dimension), dtype=int)
+    # define the M so that column i is [1;0] if i is a sire and [0;1] otherwise
+    M[0, sires] = 1
+    M[1, dams] = 1
+    # define the right hand side of the constraint Mx = m
+    m = np.full(2, 0.5)
+    model.addConstr(M@w == m, name="sum-to-half")
+
+    # optional controls to stop Gurobi taking too long
+    if time_limit: model.setParam(GRB.Param.TimeLimit, time_limit)
+    if max_duality_gap: model.setParam('MIPGap', max_duality_gap)
+
+    # model file can be used externally for verification
+    if debug: model.write("standard-opt.mps")
+
+    model.optimize()
+    return w.X, model.ObjVal
+
+
+def gurobi_robust_genetics(
+    sigma: npt.NDArray[np.float64],
+    mubar: npt.NDArray[np.float64],
+    omega: npt.NDArray[np.float64],
+    sires,  # type could be np.ndarray, sets[ints], lists[int], range, etc
+    dams,   # type could be np.ndarray, sets[ints], lists[int], range, etc
+    lam: float,
+    kappa: float,
+    dimension: int,
+    upper_bound: npt.NDArray[np.float64] | float = 1.0,
+    lower_bound: npt.NDArray[np.float64] | float = 0.0,
+    time_limit: float | None = None,
+    max_duality_gap: float | None = None,
+    debug: bool = False
+):  # -> tuple[npt.NDArray, float, float]:  # BUG Gurobi typing broken
+    """
+    Takes a sigma, mu-bar, omega, sire list, dam list, and dimension as inputs
+    and solves the robust genetic selection problem using Gurobi for given
+    values of lambda (lam) and kappa. It returns the portfolio and objective
+    value as outputs. The default lower and upper bounds of 0 and 1 can be
+    changed also.
+
+    Optional arguments time_limit and max_duality_gap respectively control how
+    long Gurobi will spend on the problem and the maximum allowable duality
+    gap. Optional argument debug sets whether Gurobi prints output to terminal.
+    """
+
+    # create models for standard and robust genetic selection
+    model = gp.Model("robust-genetics")
+
+    # Gurobi spews all its output into the terminal by default, this restricts
+    # that behaviour to only happen when the `debug` flag is used.
+    if not debug: model.setParam('OutputFlag', 0)
+
+    # integrating bounds within variable definitions is more efficient than
+    # as a separate constraint, which Gurobi would convert to bounds anyway
+    w = model.addMVar(shape=dimension, lb=lower_bound, ub=upper_bound,
+                      vtype=GRB.CONTINUOUS, name="w")
+    z = model.addVar(lb=0.0, name="z")
+
+    model.setObjective(
+        # NOTE Gurobi introduces error if we use `np.inner(w, sigma@w)` here
+        w.transpose()@mubar - (lam/2)*w.transpose()@(sigma@w) - kappa*z,
+        GRB.MAXIMIZE
+    )
+
+    # set up the two sum-to-half constraints
+    M = np.zeros((2, dimension), dtype=int)
+    # define the M so that column i is [1;0] if i is a sire and [0;1] otherwise
+    M[0, sires] = 1
+    M[1, dams] = 1
+    # define the right hand side of the constraint Mx = m
+    m = np.full(2, 0.5, dtype=float)
+    model.addConstr(M@w == m, name="sum-to-half")
+
+    # conic constraint which comes from robust optimization
+    model.addConstr(z**2 >= w.transpose()@omega@w, name="uncertainty")
+
+    # optional controls to stop Gurobi taking too long
+    if time_limit: model.setParam(GRB.Param.TimeLimit, time_limit)
+    if max_duality_gap: model.setParam('MIPGap', max_duality_gap)
+
+    # model file can be used externally for verification
+    if debug: model.write("robust-opt.mps")
+
+    model.optimize()
+    return w.X, z.X, model.ObjVal
-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
+1 1
+2 1
+3 1
+4 1