Skip to content

Commit

Permalink
Merge pull request #115 from mmcdermott/update_packages
Browse files Browse the repository at this point in the history
Some small changes to update things for more recent versions of packages.
  • Loading branch information
mmcdermott authored Jun 22, 2024
2 parents e77b5c9 + 5b4a279 commit ce9e2c3
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 43 deletions.
6 changes: 3 additions & 3 deletions EventStream/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ the following data:
indices of the measures that correspond to the measurement observations in `dynamic_indices`.
8. `dynamic_values`, which is of the same (ragged) shape as `dynamic_indices` and contains any unique
numerical values associated with those measurements. Items may be missing (reflected with `None` or
`np.NaN`, depending on the data library format) or may have been filtered out as outliers (reflected with
`np.NaN`).
`float('nan')`, depending on the data library format) or may have been filtered out as outliers (reflected with
`float('nan')`).

### Measurements

Expand Down Expand Up @@ -390,7 +390,7 @@ Let us define the following variables:
}
```

`static_data_values` and `data_values` in the above dictionary may contain `np.NaN` entries where values were
`static_data_values` and `data_values` in the above dictionary may contain `float('nan')` entries where values were
not observed with a given data element. All other data elements are fully observed. The elements correspond to
the following kinds of features:

Expand Down
35 changes: 17 additions & 18 deletions EventStream/data/dataset_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from pathlib import Path
from typing import Any, Union

import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
Expand Down Expand Up @@ -432,28 +431,28 @@ def drop_or_censor(
censor_upper_bound: pl.Expr | None = None,
**ignored_kwargs,
) -> pl.Expr:
"""Appropriately either drops (returns np.NaN) or censors (returns the censor value) the value `val`
based on the bounds in `row`.
"""Appropriately either drops (returns float('nan')) or censors (returns the censor value) the value
`val` based on the bounds in `row`.
TODO(mmd): could move this code to an outlier model in Preprocessing and have it be one that is
pre-set in metadata.
Args:
val: The value to drop, censor, or return unchanged.
drop_lower_bound: A lower bound such that if `val` is either below or at or below this level,
`np.NaN` will be returned. If `None` or `np.NaN`, no bound will be applied.
drop_lower_bound_inclusive: If `True`, returns `np.NaN` if ``val <= row['drop_lower_bound']``.
Else, returns `np.NaN` if ``val < row['drop_lower_bound']``.
`float('nan')` will be returned. If `None` or `float('nan')`, no bound will be applied.
drop_lower_bound_inclusive: If `True`, returns `float('nan')` if ``val <=
row['drop_lower_bound']``. Else, returns `float('nan')` if ``val < row['drop_lower_bound']``.
drop_upper_bound: An upper bound such that if `val` is either above or at or above this level,
`np.NaN` will be returned. If `None` or `np.NaN`, no bound will be applied.
drop_upper_bound_inclusive: If `True`, returns `np.NaN` if ``val >= row['drop_upper_bound']``.
Else, returns `np.NaN` if ``val > row['drop_upper_bound']``.
`float('nan')` will be returned. If `None` or `float('nan')`, no bound will be applied.
drop_upper_bound_inclusive: If `True`, returns `float('nan')` if ``val >=
row['drop_upper_bound']``. Else, returns `float('nan')` if ``val > row['drop_upper_bound']``.
censor_lower_bound: A lower bound such that if `val` is below this level but above
`drop_lower_bound`, `censor_lower_bound` will be returned. If `None` or `np.NaN`, no bound
will be applied.
`drop_lower_bound`, `censor_lower_bound` will be returned. If `None` or `float('nan')`, no
bound will be applied.
censor_upper_bound: An upper bound such that if `val` is above this level but below
`drop_upper_bound`, `censor_upper_bound` will be returned. If `None` or `np.NaN`, no bound
will be applied.
`drop_upper_bound`, `censor_upper_bound` will be returned. If `None` or `float('nan')`, no
bound will be applied.
"""

conditions = []
Expand All @@ -462,15 +461,15 @@ def drop_or_censor(
conditions.append(
(
(col < drop_lower_bound) | ((col == drop_lower_bound) & drop_lower_bound_inclusive),
np.NaN,
float("nan"),
)
)

if drop_upper_bound is not None:
conditions.append(
(
(col > drop_upper_bound) | ((col == drop_upper_bound) & drop_upper_bound_inclusive),
np.NaN,
float("nan"),
)
)

Expand Down Expand Up @@ -1179,7 +1178,7 @@ def _transform_numerical_measurement(
]
)
)
.then(np.NaN)
.then(float("nan"))
.when(value_type == NumericDataModalitySubtype.INTEGER)
.then(vals_col.round(0))
.otherwise(vals_col)
Expand All @@ -1203,7 +1202,7 @@ def _transform_numerical_measurement(
inliers_col = ((vals_col > pl.col("thresh_small")) & (vals_col < pl.col("thresh_large"))).alias(
inliers_col_name
)
vals_col = pl.when(inliers_col).then(vals_col).otherwise(np.NaN)
vals_col = pl.when(inliers_col).then(vals_col).otherwise(float("nan"))

present_source = present_source.with_columns(inliers_col, vals_col)
null_source = null_source.with_columns(pl.lit(None).cast(pl.Boolean).alias(inliers_col_name))
Expand Down Expand Up @@ -1241,7 +1240,7 @@ def _transform_categorical_measurement(
if config.modality == DataModality.MULTIVARIATE_REGRESSION:
transform_expr.append(
pl.when(~pl.col(measure).is_in(config.vocabulary.vocabulary))
.then(np.NaN)
.then(float("nan"))
.otherwise(pl.col(config.values_column))
.alias(config.values_column)
)
Expand Down
36 changes: 18 additions & 18 deletions EventStream/data/pytorch_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,25 +36,25 @@ def to_int_index(col: pl.Expr) -> pl.Expr:
... 'c': ['foo', 'bar', 'foo', 'bar', 'baz', None, 'bar', 'aba'],
... 'd': [1, 2, 3, 4, 5, 6, 7, 8]
... })
>>> X.with_columns(to_int_index(pl.col('c')))
shape: (8, 2)
┌──────┬─────┐
│ c    ┆ d   │
│ ---  ┆ --- │
│ u32  ┆ i64 │
╞══════╪═════╡
│ 4    ┆ 1   │
│ 1    ┆ 2   │
│ 4    ┆ 3   │
│ 1    ┆ 4   │
│ 2    ┆ 5   │
│ null ┆ 6   │
│ 1    ┆ 7   │
│ 0    ┆ 8   │
└──────┴─────┘
>>> X.with_columns(to_int_index(pl.col('c')).alias("c_index"))
shape: (8, 3)
┌──────┬─────┬─────────┐
│ c    ┆ d   ┆ c_index │
│ ---  ┆ --- ┆ ---     │
│ str  ┆ i64 ┆ u32     │
╞══════╪═════╪═════════╡
│ foo  ┆ 1   ┆ 3       │
│ bar  ┆ 2   ┆ 1       │
│ foo  ┆ 3   ┆ 3       │
│ bar  ┆ 4   ┆ 1       │
│ baz  ┆ 5   ┆ 2       │
│ null ┆ 6   ┆ null    │
│ bar  ┆ 7   ┆ 1       │
│ aba  ┆ 8   ┆ 0       │
└──────┴─────┴─────────┘
"""

indices = col.unique(maintain_order=True).drop_nulls().search_sorted(col)
indices = col.drop_nulls().unique().sort().search_sorted(col, side="left")
return pl.when(col.is_null()).then(pl.lit(None)).otherwise(indices).alias(col.meta.output_name())


Expand Down Expand Up @@ -442,7 +442,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
unified vocabulary space spanning all metadata vocabularies.
3. ``dynamic_values`` captures the numerical metadata elements listed in `self.data_cols`. If no
numerical elements are listed in `self.data_cols` for a given categorical column, the according
index in this output will be `np.NaN`.
index in this output will be `float('nan')`.
4. ``dynamic_measurement_indices`` captures which measurement vocabulary was used to source a given
data element.
5. ``static_indices`` captures the categorical metadata elements listed in `self.static_cols` in a
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ packages = [

[tool.poetry.dependencies]
python = ">=3.10,<3.13"
polars = "^0.20.26"
polars = "^0.20.31"
plotly = "^5.16.1"
ml-mixins = "^0.0.5"
humanize = "^4.8.0"
Expand All @@ -41,11 +41,12 @@ scikit-learn = "^1.3.0"
rootutils = "^1.0.7"
loguru = "^0.7.2"
nested-ragged-tensors = "^0.0.6"
numpy = "^1.26.4"

# Test dependencies
pexpect = { version="^4.8.0", optional=true }
pytest = { version="^7.4.0", optional=true }
pytest-cov = {extras = ["toml"], version = "^4.1.0", optional=true}
pytest-cov = { version = "^4.1.0", optional=true}
nbmake = { version="^1.4.3", optional=true }
pre-commit = { version="^3.3.3", optional=true}
pytest-subtests = { version="^0.11.0", optional=true}
Expand Down
4 changes: 2 additions & 2 deletions tests/data/test_pytorch_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@
[],
],
"dynamic_values": [
[[None, None, None, None], [None, 0.1, 0.3, 1.2], [None, np.NaN], [None]],
[[None, None, None, None], [None, 0.1, 0.3, 1.2], [None, float("nan")], [None]],
[[None], [None, 0.2]],
[[None], [None, None], [None]],
[],
Expand Down Expand Up @@ -241,7 +241,7 @@
[MEASUREMENTS_IDXMAP["event_type"], MEASUREMENTS_IDXMAP["multivariate_regression"]],
[MEASUREMENTS_IDXMAP["event_type"]],
],
"dynamic_values": [[None, None, None, None], [None, 0.1, 0.3, 1.2], [None, np.NaN], [None]],
"dynamic_values": [[None, None, None, None], [None, 0.1, 0.3, 1.2], [None, float("nan")], [None]],
}

WANT_SUBJ_2_UNCUT = {
Expand Down

0 comments on commit ce9e2c3

Please sign in to comment.