From 99583502e0b97a30e0f974d05e4f1836e17c77e9 Mon Sep 17 00:00:00 2001 From: Joachim Moeyens Date: Tue, 8 Oct 2024 13:26:14 -0700 Subject: [PATCH] Use quivr's drop_duplicates --- src/thor/clusters.py | 1 - src/thor/orbit_determination/fitted_orbits.py | 4 +- src/thor/utils/__init__.py | 1 - src/thor/utils/quivr.py | 54 ------------------- 4 files changed, 1 insertion(+), 59 deletions(-) delete mode 100644 src/thor/utils/quivr.py diff --git a/src/thor/clusters.py b/src/thor/clusters.py index 3ad64df3..ab0bdea8 100644 --- a/src/thor/clusters.py +++ b/src/thor/clusters.py @@ -49,7 +49,6 @@ def hash_obs_ids(obs_ids: List[str]) -> str: def drop_duplicate_clusters( clusters: "Clusters", cluster_members: "ClusterMembers", - num_cpus: int = 1, ) -> Tuple["Clusters", "ClusterMembers"]: """ Drop clusters that have identical sets of observation IDs. diff --git a/src/thor/orbit_determination/fitted_orbits.py b/src/thor/orbit_determination/fitted_orbits.py index 37306733..b0050b9d 100644 --- a/src/thor/orbit_determination/fitted_orbits.py +++ b/src/thor/orbit_determination/fitted_orbits.py @@ -8,8 +8,6 @@ from adam_core.coordinates.residuals import Residuals from adam_core.orbits import Orbits -from ..utils.quivr import drop_duplicates - __all__ = [ "FittedOrbits", "FittedOrbitMembers", @@ -124,7 +122,7 @@ def drop_duplicate_orbits( "coordinates.vz", ] - filtered = drop_duplicates(orbits, subset=subset, keep=keep) + filtered = orbits.drop_duplicates(subset=subset, keep=keep) filtered_orbit_members = orbit_members.apply_mask(pc.is_in(orbit_members.orbit_id, filtered.orbit_id)) return filtered, filtered_orbit_members diff --git a/src/thor/utils/__init__.py b/src/thor/utils/__init__.py index 5ed6fb78..d1d288c0 100644 --- a/src/thor/utils/__init__.py +++ b/src/thor/utils/__init__.py @@ -2,4 +2,3 @@ from .ades import * from .linkages import * from .logging import * -from .quivr import * diff --git a/src/thor/utils/quivr.py b/src/thor/utils/quivr.py deleted file mode 100644 index a039281f..00000000 --- a/src/thor/utils/quivr.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import List, Literal, Optional - -import numpy as np -import pyarrow as pa -import quivr as qv - - -def drop_duplicates( - table: qv.AnyTable, - subset: Optional[List[str]] = None, - keep: Literal["first", "last"] = "first", -) -> qv.AnyTable: - """ - Drop duplicate rows from a `~quivr.Table`. This function is similar to - `~pandas.DataFrame.drop_duplicates` but it supports nested columns (representing - nested tables). - - Parameters - ---------- - table : `~quivr.Table` - Table to drop duplicate rows from. - subset : list of str, optional - Subset of columns to consider when dropping duplicates. If not specified then - all columns are used. - keep : {'first', 'last'}, default 'first' - If there are duplicate rows then keep the first or last row. - - Returns - ------- - table : `~quivr.Table` - Table with duplicate rows removed. - """ - # Flatten the table so nested columns are dot-delimited at the top level - flattened_table = table.flattened_table() - - # If subset is not specified then use all the columns - if subset is None: - subset = [c for c in flattened_table.column_names] - - # Add an index column to the flattened table - flattened_table = flattened_table.add_column(0, "index", pa.array(np.arange(len(flattened_table)))) - - if keep not in ["first", "last"]: - raise ValueError(f"keep must be 'first' or 'last', got {keep}.") - - agg_func = keep - indices = ( - flattened_table.group_by(subset, use_threads=False) - .aggregate([("index", agg_func)]) - .column(f"index_{agg_func}") - ) - - # Take the indices from the flattened table and use them to index into the original table - return table.take(indices)