From 6d4b2e3bd1981092ff437738f8b7d07b4db02690 Mon Sep 17 00:00:00 2001
From: Josh Fogg <j.fogg@ed.ac.uk>
Date: Wed, 24 Jul 2024 16:46:22 +0100
Subject: [PATCH] Typing and loading improvements (#22)

Making better use of NumPy's generic C types, including `np.bool` for the binary matrix M. Loading wise, COO and CSR have been consolidated into a single function.
---
 robustocs/loaders.py | 88 +++++++++++++++++++-------------------------
 robustocs/solvers.py | 77 +++++++++++++++++++-------------------
 robustocs/utils.py   |  8 ++--
 3 files changed, 79 insertions(+), 94 deletions(-)

diff --git a/robustocs/loaders.py b/robustocs/loaders.py
index 3498865..ac8cce4 100644
--- a/robustocs/loaders.py
+++ b/robustocs/loaders.py
@@ -21,8 +21,8 @@
 
 def count_sparse_nnz(filename: str) -> int:
     """
-    Return the number of non-zero entries for a matrix stored in symmetric
-    sparse coordinate format.
+    Return the number of non-zero entries for a matrix stored in a file
+    in symmetric sparse coordinate (COO) format.
 
     Parameters
     ----------
@@ -48,7 +48,7 @@ def count_sparse_nnz(filename: str) -> int:
 
 
 def load_symmetric_matrix(filename: str, dimension: int
-                          ) -> npt.NDArray[np.float64]:
+                          ) -> npt.NDArray[np.floating]:
     """
     Since NumPy doesn't have a stock way to load symmetric matrices stored in
     symmetric coordinate format, this adds one.
@@ -66,7 +66,7 @@ def load_symmetric_matrix(filename: str, dimension: int
         The matrix represented by the file.
     """
 
-    matrix = np.zeros([dimension, dimension], dtype=float)
+    matrix = np.zeros([dimension, dimension], dtype=np.floating)
 
     with open(filename, 'r') as file:
         for line in file:
@@ -78,11 +78,11 @@ def load_symmetric_matrix(filename: str, dimension: int
     return matrix
 
 
-def load_symmetric_matrix_coo(filename: str, dimension: int, nnz: int
-                              ) -> sparse.spmatrix:
+def load_sparse_symmetric_matrix(filename: str, dimension: int, nnz: int,
+                                 format: str = 'csr') -> sparse.spmatrix:
     """
     Since neither NumPy or SciPy have a stock way to load symmetric matrices
-    into sparse coordinate format, this adds one.
+    stored in symmetric coordinate format into SciPy's formats, this adds one.
 
     Parameters
     ----------
@@ -93,17 +93,21 @@ def load_symmetric_matrix_coo(filename: str, dimension: int, nnz: int
     nnz : int
         Number of non-zero entries in the matrix stored in the file. Note that
         due to symmetry, this may be larger than the number of lines in file.
+    format : str
+        Format to use for the sparse matrix. Options available are 'coo' (for
+        sparse coordinate format) or 'csr' (compressed sparse row format).
+        Default value is `csr`.
 
     Returns
     -------
-    ndarray
+    spmatrix
         The matrix represented by the file.
     """
 
     # preallocate storage arrays
-    rows: npt.NDArray[np.int8] = np.zeros(nnz)
-    cols: npt.NDArray[np.int8] = np.zeros(nnz)
-    vals: npt.NDArray[np.float64] = np.zeros(nnz)
+    rows: npt.NDArray[np.integer] = np.zeros(nnz, dtype=np.integer)
+    cols: npt.NDArray[np.integer] = np.zeros(nnz, dtype=np.integer)
+    vals: npt.NDArray[np.floating] = np.zeros(nnz, dtype=np.floating)
 
     with open(filename, 'r') as file:
         index: int = 0
@@ -126,38 +130,20 @@ def load_symmetric_matrix_coo(filename: str, dimension: int, nnz: int
 
             index += 1
 
-    return sparse.coo_matrix(
-        (vals, (rows, cols)),
-        shape=(dimension, dimension),
-        dtype=np.float64
-    )
-
-
-def load_symmetric_matrix_csr(filename: str, dimension: int, nnz: int
-                              ) -> sparse.spmatrix:
-    """
-    Loads a symmetric matrix into compressed sparse row format. It does this
-    by first loading into sparse coordinate format and then converting with
-    Scipy. TODO: add a more efficient way which loads into CSR directly.
-
-    Parameters
-    ----------
-    filename : str
-        Filename, including extension. Space-separated data file.
-    dimension : int
-        Number of rows (and columns) of the matrix stored in the file.
-    nnz : int
-        Number of non-zero entries in the matrix stored in the file. Note that
-        due to symmetry, this may be larger than the number of lines in file.
-
-    Returns
-    -------
-    ndarray
-        The matrix represented by the file.
-    """
-
-    matrix = load_symmetric_matrix_coo(filename, dimension, nnz)
-    return sparse.csr_matrix(matrix)
+    if format == 'coo':
+        return sparse.coo_matrix(
+            (vals, (rows, cols)),
+            shape=(dimension, dimension),
+            dtype=np.floating
+        )
+    elif format == 'csr':
+        return sparse.csr_matrix(
+            (vals, (rows, cols)),
+            shape=(dimension, dimension),
+            dtype=np.floating
+        )
+    else:
+        raise ValueError("Format must be 'coo' or 'csr'.")
 
 
 def load_ped(filename: str) -> dict[int, list[int]]:
@@ -191,7 +177,7 @@ def load_ped(filename: str) -> dict[int, list[int]]:
 # MATRIX GENERATORS
 # Utility functions for generating matrices from pedigree data.
 
-def makeA(pedigree: dict[int, list[int]]) -> npt.NDArray[np.float64]:
+def makeA(pedigree: dict[int, list[int]]) -> npt.NDArray[np.floating]:
     """
     Constructs Wright's Numerator Relationship Matrix (WNRM) from a given
     pedigree structure.
@@ -210,7 +196,7 @@ def makeA(pedigree: dict[int, list[int]]) -> npt.NDArray[np.float64]:
 
     m = len(pedigree)
     # preallocate memory for A
-    A = np.zeros((m, m), dtype=float)
+    A = np.zeros((m, m), dtype=np.floating)
 
     # iterate over rows
     for i in range(0, m):
@@ -238,9 +224,9 @@ def load_problem(A_filename: str, E_filename: str, S_filename: str,
                  nnzA: int | None = None, nnzS: int | None = None,
                  dimension: int | None = None, pedigree: bool = False,
                  issparse: bool = False
-                 ) -> tuple[npt.NDArray[np.float64] | sparse.spmatrix,
-                            npt.NDArray[np.float64],
-                            npt.NDArray[np.float64] | sparse.spmatrix,
+                 ) -> tuple[npt.NDArray[np.floating] | sparse.spmatrix,
+                            npt.NDArray[np.floating],
+                            npt.NDArray[np.floating] | sparse.spmatrix,
                             int]:
     """
     Load a robust genetic selection problem into Python.
@@ -286,7 +272,7 @@ def load_problem(A_filename: str, E_filename: str, S_filename: str,
         Dimension of the problem.
     """
 
-    E = np.loadtxt(E_filename, dtype=float)
+    E = np.loadtxt(E_filename, dtype=np.floating)
     # if dimension not specified, use `E` which doesn't need preallocation
     if not dimension:
         assert isinstance(E.size, int)  # catches E being empty
@@ -296,7 +282,7 @@ def load_problem(A_filename: str, E_filename: str, S_filename: str,
     if issparse:
         if not nnzS:
             nnzS = count_sparse_nnz(S_filename)
-        S = load_symmetric_matrix_csr(S_filename, dimension, nnzS)
+        S = load_sparse_symmetric_matrix(S_filename, dimension, nnzS)
     else:
         # if nnzS was defined here, it's ignored as a parameter
         S = load_symmetric_matrix(S_filename, dimension)
@@ -312,7 +298,7 @@ def load_problem(A_filename: str, E_filename: str, S_filename: str,
         if issparse:
             if not nnzA:
                 nnzA = count_sparse_nnz(A_filename)
-            A = load_symmetric_matrix_csr(A_filename, dimension, nnzA)
+            A = load_sparse_symmetric_matrix(A_filename, dimension, nnzA)
         else:
             # if nnzA was defined here, it's ignored as a parameter
             A = load_symmetric_matrix(A_filename, dimension)
diff --git a/robustocs/solvers.py b/robustocs/solvers.py
index 58ba4e7..60d4d18 100644
--- a/robustocs/solvers.py
+++ b/robustocs/solvers.py
@@ -23,19 +23,19 @@
 
 
 def gurobi_standard_genetics(
-    sigma: npt.NDArray[np.float64] | sparse.spmatrix,
-    mu: npt.NDArray[np.float64],
+    sigma: npt.NDArray[np.floating] | sparse.spmatrix,
+    mu: npt.NDArray[np.floating],
     sires,  # type could be np.ndarray, sets[ints], lists[int], range, etc
     dams,   # type could be np.ndarray, sets[ints], lists[int], range, etc
     lam: float,  # cannot be called `lambda`, that's reserved in Python
     dimension: int,
-    upper_bound: npt.NDArray[np.float64] | float = 1.0,
-    lower_bound: npt.NDArray[np.float64] | float = 0.0,
+    upper_bound: npt.NDArray[np.floating] | float = 1.0,
+    lower_bound: npt.NDArray[np.floating] | float = 0.0,
     time_limit: float | None = None,
     max_duality_gap: float | None = None,
     model_output: str = '',
     debug: bool = False
-) -> tuple[npt.NDArray[np.float64], float]:
+) -> tuple[npt.NDArray[np.floating], float]:
     """
     Solve the standard genetic selection problem using Gurobi.
 
@@ -118,7 +118,7 @@ def gurobi_standard_genetics(
     )
 
     # set up the two sum-to-half constraints
-    M = np.zeros((2, dimension), dtype=int)
+    M = np.zeros((2, dimension), dtype=np.bool)
     # define the M so that column i is [1;0] if i is a sire and [0;1] otherwise
     M[0, sires] = 1
     M[1, dams] = 1
@@ -141,21 +141,21 @@ def gurobi_standard_genetics(
 
 
 def gurobi_robust_genetics(
-    sigma: npt.NDArray[np.float64] | sparse.spmatrix,
-    mubar: npt.NDArray[np.float64],
-    omega: npt.NDArray[np.float64] | sparse.spmatrix,
+    sigma: npt.NDArray[np.floating] | sparse.spmatrix,
+    mubar: npt.NDArray[np.floating],
+    omega: npt.NDArray[np.floating] | sparse.spmatrix,
     sires,  # type could be np.ndarray, sets[ints], lists[int], range, etc
     dams,   # type could be np.ndarray, sets[ints], lists[int], range, etc
     lam: float,  # cannot be called `lambda`, that's reserved in Python
     kappa: float,
     dimension: int,
-    upper_bound: npt.NDArray[np.float64] | float = 1.0,
-    lower_bound: npt.NDArray[np.float64] | float = 0.0,
+    upper_bound: npt.NDArray[np.floating] | float = 1.0,
+    lower_bound: npt.NDArray[np.floating] | float = 0.0,
     time_limit: float | None = None,
     max_duality_gap: float | None = None,
     model_output: str = '',
     debug: bool = False
-) -> tuple[npt.NDArray[np.float64], float, float]:
+) -> tuple[npt.NDArray[np.floating], float, float]:
     """
     Solve the robust genetic selection problem using Gurobi.
 
@@ -227,7 +227,7 @@ def gurobi_robust_genetics(
     ndarray
         Portfolio vector which Gurobi has determined is a solution.
     float
-        Auxillary variable corresponding to uncertainty associated with the
+        Auxiliary variable corresponding to uncertainty associated with the
         portfolio vector which Gurobi has determined is a solution.
     float
         Value of the objective function for returned solution vector.
@@ -254,12 +254,12 @@ def gurobi_robust_genetics(
     )
 
     # set up the two sum-to-half constraints
-    M = np.zeros((2, dimension), dtype=int)
+    M = np.zeros((2, dimension), dtype=np.bool)
     # define the M so that column i is [1;0] if i is a sire and [0;1] otherwise
     M[0, sires] = 1
     M[1, dams] = 1
     # define the right hand side of the constraint Mx = m
-    m = np.full(2, 0.5, dtype=float)
+    m = np.full(2, 0.5, dtype=np.floating)
     model.addConstr(M@w == m, name="sum-to-half")
 
     # conic constraint which comes from robust optimization
@@ -280,23 +280,23 @@ def gurobi_robust_genetics(
 
 
 def gurobi_robust_genetics_sqp(
-    sigma: npt.NDArray[np.float64] | sparse.spmatrix,
-    mubar: npt.NDArray[np.float64],
-    omega: npt.NDArray[np.float64] | sparse.spmatrix,
+    sigma: npt.NDArray[np.floating] | sparse.spmatrix,
+    mubar: npt.NDArray[np.floating],
+    omega: npt.NDArray[np.floating] | sparse.spmatrix,
     sires,  # type could be np.ndarray, sets[ints], lists[int], range, etc
     dams,   # type could be np.ndarray, sets[ints], lists[int], range, etc
     lam: float,  # cannot be called `lambda`, that's reserved in Python
     kappa: float,
     dimension: int,
-    upper_bound: npt.NDArray[np.float64] | float = 1.0,
-    lower_bound: npt.NDArray[np.float64] | float = 0.0,
+    upper_bound: npt.NDArray[np.floating] | float = 1.0,
+    lower_bound: npt.NDArray[np.floating] | float = 0.0,
     time_limit: float | None = None,
     max_duality_gap: float | None = None,
     max_iterations: int = 1000,
     robust_gap_tol: float = 1e-7,
     model_output: str = '',
     debug: bool = False
-) -> tuple[npt.NDArray[np.float64], float, float]:
+) -> tuple[npt.NDArray[np.floating], float, float]:
     """
     Solve the robust genetic selection problem using SQP in Gurobi.
 
@@ -403,12 +403,12 @@ def gurobi_robust_genetics_sqp(
     )
 
     # set up the two sum-to-half constraints
-    M = np.zeros((2, dimension), dtype=int)
+    M = np.zeros((2, dimension), dtype=np.bool)
     # define the M so that column i is [1;0] if i is a sire and [0;1] otherwise
     M[0, sires] = 1
     M[1, dams] = 1
     # define the right hand side of the constraint Mx = m
-    m = np.full(2, 0.5, dtype=float)
+    m = np.full(2, 0.5, dtype=np.floating)
     model.addConstr(M@w == m, name="sum-to-half")
 
     # optional controls to stop Gurobi taking too long
@@ -438,7 +438,7 @@ def gurobi_robust_genetics_sqp(
             print("No active constraints!")
 
         # z coefficient for the new constraint
-        w_star: npt.NDArray[np.float64] = np.array(w.X)
+        w_star: npt.NDArray[np.floating] = np.array(w.X)
         alpha: float = sqrt(w_star.transpose()@omega@w_star)
 
         # if gap between z and w'Omega w has converged, done
@@ -452,8 +452,8 @@ def gurobi_robust_genetics_sqp(
 
 
 def highs_bound_like(dimension: int,
-                     value: float | list[float] | npt.NDArray[np.float64]
-                     ):  # -> npt.NDArray[np.float64] | list[float] # BUG broke
+                     value: float | list[float] | npt.NDArray[np.floating]
+                     ):  # -> npt.NDArray[np.floating] | list[float] # BUG broke
     """
     Helper function which allows HiGHS to interpret variable bounds specified
     either as a vector or a single floating point value. If `value` is an array
@@ -466,18 +466,18 @@ def highs_bound_like(dimension: int,
 
 def highs_standard_genetics(
     sigma: sparse.spmatrix,
-    mu: npt.NDArray[np.float64],
+    mu: npt.NDArray[np.floating],
     sires,  # type could be np.ndarray, sets[ints], lists[int], range, etc
     dams,   # type could be np.ndarray, sets[ints], lists[int], range, etc
     lam: float,  # cannot be called `lambda`, that's reserved in Python
     dimension: int,
-    upper_bound: npt.NDArray[np.float64] | list[float] | float = 1.0,
-    lower_bound: npt.NDArray[np.float64] | list[float] | float = 0.0,
+    upper_bound: npt.NDArray[np.floating] | list[float] | float = 1.0,
+    lower_bound: npt.NDArray[np.floating] | list[float] | float = 0.0,
     time_limit: float | None = None,
     max_duality_gap: float | None = None,
     model_output: str = '',
     debug: bool = False
-) -> tuple[npt.NDArray[np.float64], float]:
+) -> tuple[npt.NDArray[np.floating], float]:
     """
     Solve the standard genetic selection problem using HiGHS.
 
@@ -557,7 +557,6 @@ def highs_standard_genetics(
     model.lp_.col_upper_ = highs_bound_like(dimension, upper_bound)
 
     # define the quadratic term in the objective
-    sigma = sparse.csc_matrix(sigma)  # BUG is sigma in CSR or CSC format?
     model.hessian_.dim_ = dimension
     model.hessian_.start_ = sigma.indptr
     model.hessian_.index_ = sigma.indices
@@ -609,7 +608,7 @@ def highs_standard_genetics(
         raise RuntimeError
 
     # by default, col_value is a stock-Python list
-    solution: npt.NDArray[np.float64] = np.array(h.getSolution().col_value)
+    solution: npt.NDArray[np.floating] = np.array(h.getSolution().col_value)
     # we negated the objective function, so negate it back
     objective_value: float = -h.getInfo().objective_function_value
 
@@ -618,22 +617,22 @@ def highs_standard_genetics(
 
 def highs_robust_genetics_sqp(
     sigma: sparse.spmatrix,
-    mubar: npt.NDArray[np.float64],
-    omega: npt.NDArray[np.float64] | sparse.spmatrix,
+    mubar: npt.NDArray[np.floating],
+    omega: npt.NDArray[np.floating] | sparse.spmatrix,
     sires,  # type could be np.ndarray, sets[ints], lists[int], range, etc
     dams,   # type could be np.ndarray, sets[ints], lists[int], range, etc
     lam: float,  # cannot be called `lambda`, that's reserved in Python
     kappa: float,
     dimension: int,
-    upper_bound: npt.NDArray[np.float64] | list[float] | float = 1.0,
-    lower_bound: npt.NDArray[np.float64] | list[float] | float = 0.0,
+    upper_bound: npt.NDArray[np.floating] | list[float] | float = 1.0,
+    lower_bound: npt.NDArray[np.floating] | list[float] | float = 0.0,
     time_limit: float | None = None,
     max_duality_gap: float | None = None,
     max_iterations: int = 1000,
     robust_gap_tol: float = 1e-7,
     model_output: str = '',
     debug: bool = False
-) -> tuple[npt.NDArray[np.float64], float, float]:
+) -> tuple[npt.NDArray[np.floating], float, float]:
     """
     Solve the robust genetic selection problem using SQP in HiGHS.
 
@@ -795,7 +794,7 @@ def highs_robust_genetics_sqp(
 
         # by default, col_value is a stock-Python list
         solution: list[float] = h.getSolution().col_value
-        w_star: npt.NDArray[np.float64] = np.array(solution[:-1])
+        w_star: npt.NDArray[np.floating] = np.array(solution[:-1])
         z_star: float = solution[-1]
 
         # we negated the objective function, so negate it back
@@ -825,7 +824,7 @@ def highs_robust_genetics_sqp(
         # add a new plane to the approximation of the uncertainty cone
         num_nz: int = dimension + 1  # HACK assuming entirely dense
         index: range = range(dimension + 1)
-        value: npt.NDArray[np.float64] = np.append(-omega@w_star, alpha)
+        value: npt.NDArray[np.floating] = np.append(-omega@w_star, alpha)
         h.addRow(0, inf, num_nz, index, value)
 
     # final value of solution is the z value, return separately
diff --git a/robustocs/utils.py b/robustocs/utils.py
index 4353ec1..42d5e50 100644
--- a/robustocs/utils.py
+++ b/robustocs/utils.py
@@ -15,8 +15,8 @@
 
 
 def print_compare_solutions(
-    portfolio1: npt.NDArray[np.float64],
-    portfolio2: npt.NDArray[np.float64],
+    portfolio1: npt.NDArray[np.floating],
+    portfolio2: npt.NDArray[np.floating],
     objective1: float,
     objective2: float,
     precision: int = 5,
@@ -106,8 +106,8 @@ def obj_string(name: str, value: float, precision: int,
 
 def check_uncertainty_constraint(
     z: float,
-    w: npt.NDArray[np.float64],
-    omega: npt.NDArray[np.float64] | sparse.spmatrix,
+    w: npt.NDArray[np.floating],
+    omega: npt.NDArray[np.floating] | sparse.spmatrix,
     tol: float = 1e-7,
     debug: bool = False
 ) -> bool: