From 636ba20ec3303f04c96ab0780e7a1095044ad40a Mon Sep 17 00:00:00 2001
From: Josh Fogg <j.fogg@ed.ac.uk>
Date: Thu, 25 Jul 2024 17:49:33 +0100
Subject: [PATCH] Added functions for loading sex data (#23)

* Added `load_sexes` (fixes #18)
* Expanded `load_problem` to support sex data
* Made `load_problem` robust to optional arguments
* Updated examples and tests to reflect changes
---
 example.py            |  16 ++--
 examples/04/test.py   |  11 +--
 examples/1000/test.py |   9 +-
 examples/50/test.py   |   9 +-
 robustocs/loaders.py  | 201 +++++++++++++++++++++++++++++++-----------
 5 files changed, 171 insertions(+), 75 deletions(-)

diff --git a/example.py b/example.py
index 319c0ce..8a9a5c8 100644
--- a/example.py
+++ b/example.py
@@ -3,19 +3,15 @@
 import robustocs
 
 # key problem variables loaded from standard format txt files
-sigma, mubar, omega, n = robustocs.load_problem(
-    "examples/04/A04.txt",
-    "examples/04/EBV04.txt",
-    "examples/04/S04.txt",
+sigma, mubar, omega, n, sires, dams, names = robustocs.load_problem(
+    sigma_filename="examples/04/A04.txt",
+    mu_filename="examples/04/EBV04.txt",
+    omega_filename="examples/04/S04.txt",
+    sex_filename="examples/04/SEX04.txt",
     issparse=True
 )
 
-# NOTE this trick of handling sex data is specific to the initial simulation
-# data which is structured so that candidates alternate between sires (which
-# have even indices) and dams (which have odd indices).
-sires = range(0, n, 2)
-dams = range(1, n, 2)
-
+# parameters with which to solve the problem
 lam = 0.5
 kap = 1
 
diff --git a/examples/04/test.py b/examples/04/test.py
index 5813964..072b9e5 100644
--- a/examples/04/test.py
+++ b/examples/04/test.py
@@ -6,13 +6,14 @@
 # SETUP
 # -----
 
-# load in the problem variables
-sigma, mubar, omega, n = rocs.load_problem(
-    "A04.txt",
-    "EBV04.txt",
-    "S04.txt",
+# load in the problem variables (using underscore for those dropped)
+sigma, mubar, omega, n, _, _, _ = rocs.load_problem(
+    sigma_filename="A04.txt",
+    mu_filename="EBV04.txt",
+    omega_filename="S04.txt",
     issparse=True
 )
+
 sires = range(0, n, 2)
 dams = range(1, n, 2)
 lam = 0.5
diff --git a/examples/1000/test.py b/examples/1000/test.py
index 2772fb8..99561a3 100644
--- a/examples/1000/test.py
+++ b/examples/1000/test.py
@@ -7,12 +7,13 @@
 # -----
 
 # load in the problem variables
-sigma, mubar, omega, n = rocs.load_problem(
-    "A1000.txt",
-    "EBV1000.txt",
-    "S1000.txt",
+sigma, mubar, omega, n, _, _, _ = rocs.load_problem(
+    sigma_filename="A1000.txt",
+    mu_filename="EBV1000.txt",
+    omega_filename="S1000.txt",
     issparse=True
 )
+
 sires = range(0, n, 2)
 dams = range(1, n, 2)
 lam = 0.5
diff --git a/examples/50/test.py b/examples/50/test.py
index ff8c3e1..d166706 100644
--- a/examples/50/test.py
+++ b/examples/50/test.py
@@ -7,12 +7,13 @@
 # -----
 
 # load in the problem variables
-sigma, mubar, omega, n = rocs.load_problem(
-    "A50.txt",
-    "EBV50.txt",
-    "S50.txt",
+sigma, mubar, omega, n, _, _, _ = rocs.load_problem(
+    sigma_filename="A50.txt",
+    mu_filename="EBV50.txt",
+    omega_filename="S50.txt",
     issparse=True
 )
+
 sires = range(0, n, 2)
 dams = range(1, n, 2)
 lam = 0.5
diff --git a/robustocs/loaders.py b/robustocs/loaders.py
index ac8cce4..210876b 100644
--- a/robustocs/loaders.py
+++ b/robustocs/loaders.py
@@ -146,6 +146,61 @@ def load_sparse_symmetric_matrix(filename: str, dimension: int, nnz: int,
         raise ValueError("Format must be 'coo' or 'csr'.")
 
 
+def load_sexes(filename: str, dimension: int) -> tuple[
+    npt.NDArray[np.unsignedinteger],
+    npt.NDArray[np.unsignedinteger],
+    npt.NDArray[np.str_]
+]:
+    """
+    Function for loading cohort sex data from file. Uniquely among the
+    file formats we're working with, the format for sex data also includes
+    a label for each candidate in the cohort, so also returns those.
+
+    Parameters
+    ----------
+    filename : str
+        Filename, including extension. Space-separated data file.
+    dimension : int
+        Number of rows of data stored in the file
+
+    Returns
+    -------
+    ndarray
+        Array of indices for sires in the cohort.
+    ndarray
+        Array of indices for dams in the cohort.
+    ndarray
+        Array of labels for each candidate in the cohort.
+    """
+
+    # preallocate output vectors
+    sires = np.zeros((dimension,), dtype=np.unsignedinteger)
+    dams = np.zeros((dimension,), dtype=np.unsignedinteger)
+    names = np.zeros((dimension,), dtype=np.str_)
+
+    # index trackers
+    sire_count: int = 0
+    dam_count: int = 0
+
+    with open(filename, 'r') as file:
+        for line in file:
+            name, sex = line.strip().split(" ")
+
+            if sex == "M":
+                sires[sire_count] = sire_count + dam_count
+                sire_count += 1
+            elif sex == "F":
+                dams[dam_count] = sire_count + dam_count
+                dam_count += 1
+            else:
+                raise ValueError(f"invalid sex input {sex}")
+
+            names[sire_count + dam_count - 1] = name
+
+    # cull the entries of the index vectors which weren't used
+    return sires[:sire_count], dams[:dam_count], names
+
+
 def load_ped(filename: str) -> dict[int, list[int]]:
     """
     Function for reading pedigree files into a Python dictionary.
@@ -220,87 +275,129 @@ def makeA(pedigree: dict[int, list[int]]) -> npt.NDArray[np.floating]:
 # problems into Python. At the moment this is through a single function,
 # though may need to be split if more features are added.
 
-def load_problem(A_filename: str, E_filename: str, S_filename: str,
-                 nnzA: int | None = None, nnzS: int | None = None,
-                 dimension: int | None = None, pedigree: bool = False,
-                 issparse: bool = False
-                 ) -> tuple[npt.NDArray[np.floating] | sparse.spmatrix,
-                            npt.NDArray[np.floating],
-                            npt.NDArray[np.floating] | sparse.spmatrix,
-                            int]:
+def load_problem(
+    sigma_filename: str,
+    mu_filename: str,
+    omega_filename: str | None = None,
+    sex_filename: str | None = None,
+    nnz_sigma: int | None = None,
+    nnz_omega: int | None = None,
+    dimension: int | None = None,
+    pedigree: bool = False,
+    issparse: bool = False
+) -> tuple[
+    npt.NDArray[np.floating] | sparse.spmatrix,
+    npt.NDArray[np.floating],
+    npt.NDArray[np.floating] | sparse.spmatrix | None,
+    int,
+    npt.NDArray[np.unsignedinteger] | None,
+    npt.NDArray[np.unsignedinteger] | None,
+    npt.NDArray[np.str_] | None
+]:
     """
     Load a robust genetic selection problem into Python.
 
     Parameters
     ----------
-    A_filename : str
-        Filename for a file which encodes `A` (which is Sigma) whether in
-        sparse coordinate or pedigree format.
-    E_filename : str
-        Filename for a file which encodes `E` (which is Mu-bar).
-    S_filename : str
-        Filename for a file which encodes `S` (which is Omega) which will be
-        in sparse coordinate format.
-    nnzA : int, optional
-        Number of non-zeros in the matrix A. If not provided and `sparse` is
+    sigma_filename : str
+        Filename for a file which encodes sigma, whether that's in sparse
+        coordinate format or pedigree format.
+    mu_filename : str
+        Filename for a file which encodes mu or mubar.
+    omega_filename : str, optional
+        Filename for a file which encodes omega, which will be in sparse
+        coordinate format. Default value is `None`.
+    sex_filename : str, optional
+        Filename for a file which encodes the sex data for the cohort. Also
+        includes a label for each candidate. Default value is `None`.
+    nnz_sigma : int, optional
+        Number of non-zeros in sigma. If not provided and `sparse` is
         True, value computed using `count_sparse_nnz`. Default is `None`.
-    nnzS : int, optional
-        Number of non-zeros in the matrix S. If not provided and `sparse` is
+    nnz_omega : int, optional
+        Number of non-zeros in omega. If not provided and `sparse` is
         True, value computed using `count_sparse_nnz`. Default is `None`.
-    dimension : int or None, optional
+    dimension : int, optional
         The size of the problem, which can be specified to aid preallocation
-        or worked out explicitly from the `E` / mu produced. Default value
-        is `None`, i.e. the value is derived from `E`.
+        or worked out explicitly from the mu / mubar produced. Default value
+        is `None`, i.e. the value is derived from mu.
     pedigree : bool, optional
-        Signifies whether `A` is stored as a pedigree structure (`True`)
+        Signifies whether sigma is stored as a pedigree structure (`True`)
         or in sparse coordinate format (`False`). Default value is `False`.
     issparse : bool, optional
-        Signifies whether `A` and `S` should be returned in compressed sparse
-        row format. Default value is `False`.
+        Signifies whether sigma and omega should be returned in compressed
+        sparse row format. Default value is `False`.
 
     Returns
     -------
     ndarray or spmatrix
-        Covariance matrix of candidates in the cohort.
+        Covariance matrix of candidates in the cohort (sigma).
     ndarray
         Vector of expected values of the expected breeding values of
-        candidates in the cohort.
-    ndarray or spmatrix
+        candidates in the cohort (mu).
+    ndarray, spmatrix, or None
         Covariance matrix of expected breeding values of candidates in the
-        cohort.
+        cohort (omega). If a filename was not provided, value is `None`.
     int
-        Dimension of the problem.
+        Dimension of the problem (n).
+    ndarray or None
+        Array of indices of sires in the cohort (S). If a filename for sex data
+        was not provided, value is `None`.
+    ndarray or None
+        Array of indices of dams in the cohort (D). If a filename for sex data
+        was not provided, value is `None`.
+    ndarray or None
+        Array of names given to the original candidates in the cohort. If a
+        filename for sex data was not provided (which also includes name data),
+        then the value is `None`.
     """
 
-    E = np.loadtxt(E_filename, dtype=np.floating)
-    # if dimension not specified, use `E` which doesn't need preallocation
+    mu: npt.NDArray[np.floating] = np.loadtxt(mu_filename, dtype=np.floating)
+    # if dimension not specified, use `mu` which doesn't need preallocation
     if not dimension:
-        assert isinstance(E.size, int)  # catches E being empty
-        dimension = E.size
-
-    # can load S from coordinates to SciPy's CSR or NumPy's dense format
-    if issparse:
-        if not nnzS:
-            nnzS = count_sparse_nnz(S_filename)
-        S = load_sparse_symmetric_matrix(S_filename, dimension, nnzS)
+        assert isinstance(mu.size, int)  # catches mu being empty
+        dimension = mu.size
+
+    # filename for sex data is optional, so skip if not provided
+    if sex_filename is not None:
+        sires, dams, names = load_sexes(sex_filename, dimension)
     else:
-        # if nnzS was defined here, it's ignored as a parameter
-        S = load_symmetric_matrix(S_filename, dimension)
+        sires = dams = names = None
+
+    # filename for omega is optional, so skip if not provided
+    if omega_filename is not None:
+        # can load omega from file to SciPy's CSR or NumPy's dense format
+        if issparse:
+            # find omega's number of non zeros if it wasn't given
+            if not nnz_omega:
+                nnz_omega = count_sparse_nnz(omega_filename)
+            omega: sparse.spmatrix = load_sparse_symmetric_matrix(
+                omega_filename, dimension, nnz_omega
+            )
+        else:
+            # if nnz_omega was defined, it's ignored as a parameter
+            omega: npt.NDArray[np.floating] = load_symmetric_matrix(
+                omega_filename, dimension
+            )
 
-    # A can be stored as a pedigree or by coordinates and can be loaded to
+    # sigma can be stored as a pedigree or by coordinates and can be loaded to
     # SciPy's CSR or Numpy's dense format. Hence have four branches below.
     if pedigree:
-        A = makeA(load_ped(A_filename))
+        sigma: npt.NDArray[np.floating] = makeA(load_ped(sigma_filename))
         # HACK this loads the full matrix, then converts it down to sparse
         if issparse:
-            A = sparse.coo_matrix(A)
+            sigma: sparse.spmatrix = sparse.coo_matrix(sigma)
     else:
         if issparse:
-            if not nnzA:
-                nnzA = count_sparse_nnz(A_filename)
-            A = load_sparse_symmetric_matrix(A_filename, dimension, nnzA)
+            # find sigma's number of non zeros if it wasn't given
+            if not nnz_sigma:
+                nnz_sigma = count_sparse_nnz(sigma_filename)
+            sigma: sparse.spmatrix = load_sparse_symmetric_matrix(
+                sigma_filename, dimension, nnz_sigma
+            )
         else:
-            # if nnzA was defined here, it's ignored as a parameter
-            A = load_symmetric_matrix(A_filename, dimension)
+            # if nnz_sigma was defined here, it's ignored as a parameter
+            sigma: npt.NDArray[np.floating] = load_symmetric_matrix(
+                sigma_filename, dimension
+            )
 
-    return A, E, S, dimension
+    return sigma, mu, omega, dimension, sires, dams, names