Merge pull request #115 from mmcdermott/update_packages

Some small changes to update things for more recent versions of packages.
mmcdermott · Jun 22, 2024 · ce9e2c3 · ce9e2c3
2 parents e77b5c9 + 5b4a279
commit ce9e2c3
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 43 deletions.
diff --git a/EventStream/data/README.md b/EventStream/data/README.md
@@ -76,8 +76,8 @@ the following data:
    indices of the measures that correspond to the measurement observations in `dynamic_indices`.
 8. `dynamic_values`, which is of the same (ragged) shape as `dynamic_indices` and contains any unique
    numerical values associated with those measurements. Items may be missing (reflected with `None` or
-   `np.NaN`, depending on the data library format) or may have been filtered out as outliers (reflected with
-   `np.NaN`).
+   `float('nan')`, depending on the data library format) or may have been filtered out as outliers (reflected with
+   `float('nan')`).
 
 ### Measurements
 
@@ -390,7 +390,7 @@ Let us define the following variables:
 }
 ```
 
-`static_data_values` and `data_values` in the above dictionary may contain `np.NaN` entries where values were
+`static_data_values` and `data_values` in the above dictionary may contain `float('nan')` entries where values were
 not observed with a given data element. All other data elements are fully observed. The elements correspond to
 the following kinds of features:
 

diff --git a/EventStream/data/dataset_polars.py b/EventStream/data/dataset_polars.py
@@ -16,7 +16,6 @@
 from pathlib import Path
 from typing import Any, Union
 
-import numpy as np
 import pandas as pd
 import polars as pl
 import polars.selectors as cs
@@ -432,28 +431,28 @@ def drop_or_censor(
         censor_upper_bound: pl.Expr | None = None,
         **ignored_kwargs,
     ) -> pl.Expr:
-        """Appropriately either drops (returns np.NaN) or censors (returns the censor value) the value `val`
-        based on the bounds in `row`.
+        """Appropriately either drops (returns float('nan')) or censors (returns the censor value) the value
+        `val` based on the bounds in `row`.
 
         TODO(mmd): could move this code to an outlier model in Preprocessing and have it be one that is
         pre-set in metadata.
 
         Args:
             val: The value to drop, censor, or return unchanged.
             drop_lower_bound: A lower bound such that if `val` is either below, or at or below, this level,
-                `np.NaN` will be returned. If `None` or `np.NaN`, no bound will be applied.
-            drop_lower_bound_inclusive: If `True`, returns `np.NaN` if ``val <= row['drop_lower_bound']``.
-                Else, returns `np.NaN` if ``val < row['drop_lower_bound']``.
+                `float('nan')` will be returned. If `None` or `float('nan')`, no bound will be applied.
+            drop_lower_bound_inclusive: If `True`, returns `float('nan')` if ``val <=
+                row['drop_lower_bound']``. Else, returns `float('nan')` if ``val < row['drop_lower_bound']``.
             drop_upper_bound: An upper bound such that if `val` is either above, or at or above, this level,
-                `np.NaN` will be returned. If `None` or `np.NaN`, no bound will be applied.
-            drop_upper_bound_inclusive: If `True`, returns `np.NaN` if ``val >= row['drop_upper_bound']``.
-                Else, returns `np.NaN` if ``val > row['drop_upper_bound']``.
+                `float('nan')` will be returned. If `None` or `float('nan')`, no bound will be applied.
+            drop_upper_bound_inclusive: If `True`, returns `float('nan')` if ``val >=
+                row['drop_upper_bound']``. Else, returns `float('nan')` if ``val > row['drop_upper_bound']``.
             censor_lower_bound: A lower bound such that if `val` is below this level but above
-                `drop_lower_bound`, `censor_lower_bound` will be returned. If `None` or `np.NaN`, no bound
-                will be applied.
+                `drop_lower_bound`, `censor_lower_bound` will be returned. If `None` or `float('nan')`, no
+                bound will be applied.
             censor_upper_bound: An upper bound such that if `val` is above this level but below
-                `drop_upper_bound`, `censor_upper_bound` will be returned. If `None` or `np.NaN`, no bound
-                will be applied.
+                `drop_upper_bound`, `censor_upper_bound` will be returned. If `None` or `float('nan')`, no
+                bound will be applied.
         """
 
         conditions = []
@@ -462,15 +461,15 @@ def drop_or_censor(
             conditions.append(
                 (
                     (col < drop_lower_bound) | ((col == drop_lower_bound) & drop_lower_bound_inclusive),
-                    np.NaN,
+                    float("nan"),
                 )
             )
 
         if drop_upper_bound is not None:
             conditions.append(
                 (
                     (col > drop_upper_bound) | ((col == drop_upper_bound) & drop_upper_bound_inclusive),
-                    np.NaN,
+                    float("nan"),
                 )
             )
 
@@ -1179,7 +1178,7 @@ def _transform_numerical_measurement(
                     ]
                 )
             )
-            .then(np.NaN)
+            .then(float("nan"))
             .when(value_type == NumericDataModalitySubtype.INTEGER)
             .then(vals_col.round(0))
             .otherwise(vals_col)
@@ -1203,7 +1202,7 @@ def _transform_numerical_measurement(
             inliers_col = ((vals_col > pl.col("thresh_small")) & (vals_col < pl.col("thresh_large"))).alias(
                 inliers_col_name
             )
-            vals_col = pl.when(inliers_col).then(vals_col).otherwise(np.NaN)
+            vals_col = pl.when(inliers_col).then(vals_col).otherwise(float("nan"))
 
             present_source = present_source.with_columns(inliers_col, vals_col)
             null_source = null_source.with_columns(pl.lit(None).cast(pl.Boolean).alias(inliers_col_name))
@@ -1241,7 +1240,7 @@ def _transform_categorical_measurement(
         if config.modality == DataModality.MULTIVARIATE_REGRESSION:
             transform_expr.append(
                 pl.when(~pl.col(measure).is_in(config.vocabulary.vocabulary))
-                .then(np.NaN)
+                .then(float("nan"))
                 .otherwise(pl.col(config.values_column))
                 .alias(config.values_column)
             )

diff --git a/EventStream/data/pytorch_dataset.py b/EventStream/data/pytorch_dataset.py
@@ -36,25 +36,25 @@ def to_int_index(col: pl.Expr) -> pl.Expr:
         ...     'c': ['foo', 'bar', 'foo', 'bar', 'baz', None, 'bar', 'aba'],
         ...     'd': [1, 2, 3, 4, 5, 6, 7, 8]
         ... })
-        >>> X.with_columns(to_int_index(pl.col('c')))
-        shape: (8, 2)
-        ┌──────┬─────┐
-        │ c    ┆ d   │
-        │ ---  ┆ --- │
-        │ u32  ┆ i64 │
-        ╞══════╪═════╡
-        │ 4    ┆ 1   │
-        │ 1    ┆ 2   │
-        │ 4    ┆ 3   │
-        │ 1    ┆ 4   │
-        │ 2    ┆ 5   │
-        │ null ┆ 6   │
-        │ 1    ┆ 7   │
-        │ 0    ┆ 8   │
-        └──────┴─────┘
+        >>> X.with_columns(to_int_index(pl.col('c')).alias("c_index"))
+        shape: (8, 3)
+        ┌──────┬─────┬─────────┐
+        │ c    ┆ d   ┆ c_index │
+        │ ---  ┆ --- ┆ ---     │
+        │ str  ┆ i64 ┆ u32     │
+        ╞══════╪═════╪═════════╡
+        │ foo  ┆ 1   ┆ 3       │
+        │ bar  ┆ 2   ┆ 1       │
+        │ foo  ┆ 3   ┆ 3       │
+        │ bar  ┆ 4   ┆ 1       │
+        │ baz  ┆ 5   ┆ 2       │
+        │ null ┆ 6   ┆ null    │
+        │ bar  ┆ 7   ┆ 1       │
+        │ aba  ┆ 8   ┆ 0       │
+        └──────┴─────┴─────────┘
     """
 
-    indices = col.unique(maintain_order=True).drop_nulls().search_sorted(col)
+    indices = col.drop_nulls().unique().sort().search_sorted(col, side="left")
     return pl.when(col.is_null()).then(pl.lit(None)).otherwise(indices).alias(col.meta.output_name())
 
 
@@ -442,7 +442,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
            unified vocabulary space spanning all metadata vocabularies.
         3. ``dynamic_values`` captures the numerical metadata elements listed in `self.data_cols`. If no
            numerical elements are listed in `self.data_cols` for a given categorical column, the corresponding
-           index in this output will be `np.NaN`.
+           index in this output will be `float('nan')`.
         4. ``dynamic_measurement_indices`` captures which measurement vocabulary was used to source a given
            data element.
         5. ``static_indices`` captures the categorical metadata elements listed in `self.static_cols` in a

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,7 +18,7 @@ packages = [
 
 [tool.poetry.dependencies]
 python = ">=3.10,<3.13"
-polars = "^0.20.26"
+polars = "^0.20.31"
 plotly = "^5.16.1"
 ml-mixins = "^0.0.5"
 humanize = "^4.8.0"
@@ -41,11 +41,12 @@ scikit-learn = "^1.3.0"
 rootutils = "^1.0.7"
 loguru = "^0.7.2"
 nested-ragged-tensors = "^0.0.6"
+numpy = "^1.26.4"
 
 # Test dependencies
 pexpect = { version="^4.8.0", optional=true }
 pytest = { version="^7.4.0", optional=true }
-pytest-cov = {extras = ["toml"], version = "^4.1.0", optional=true}
+pytest-cov = { version = "^4.1.0", optional=true}
 nbmake = { version="^1.4.3", optional=true }
 pre-commit = { version="^3.3.3", optional=true}
 pytest-subtests = { version="^0.11.0", optional=true}

diff --git a/tests/data/test_pytorch_dataset.py b/tests/data/test_pytorch_dataset.py
@@ -179,7 +179,7 @@
             [],
         ],
         "dynamic_values": [
-            [[None, None, None, None], [None, 0.1, 0.3, 1.2], [None, np.NaN], [None]],
+            [[None, None, None, None], [None, 0.1, 0.3, 1.2], [None, float("nan")], [None]],
             [[None], [None, 0.2]],
             [[None], [None, None], [None]],
             [],
@@ -241,7 +241,7 @@
         [MEASUREMENTS_IDXMAP["event_type"], MEASUREMENTS_IDXMAP["multivariate_regression"]],
         [MEASUREMENTS_IDXMAP["event_type"]],
     ],
-    "dynamic_values": [[None, None, None, None], [None, 0.1, 0.3, 1.2], [None, np.NaN], [None]],
+    "dynamic_values": [[None, None, None, None], [None, 0.1, 0.3, 1.2], [None, float("nan")], [None]],
 }
 
 WANT_SUBJ_2_UNCUT = {