Support Pandas 2 (#742)
* Fix test setup to match pandas 2.0 demands

* Use the now deprecated _append method

(Better solution might exist)

* Deal with numeric_only being removed in metrics test

* Skip mad metric for other pandas versions

* Account for differences between pandas versions in describe methods

* Run black

* Check Pandas version first

* Mirror behaviour of installed Pandas version when running value_counts

* Allow passing arguments to the individual asserters

* Fix for method _construct_axes_from_arguments no longer existing

* Skip mad metric if it does not exist

* Account for pandas 2.0 timestamp default behaviour

* Deal with empty vs other inferred data types

* Account for default datetime precision change

* Run Black

* Solution for differences in inferred_type only

* Fix csv and json issues

* Skip two doctests

* Passing a set as indexer is no longer allowed

* Don't validate output where it differs between Pandas versions in the environment

* Update test matrix and packaging metadata

* Update version of Python in the docs

* Update Python version in demo notebook

* Match noxfile

* Symmetry

* Fix trailing comma in JSON

* Revert some changes in setup.py to fix building the documentation

* Revert "Revert some changes in setup.py to fix building the documentation"

This reverts commit ea98797.

* Use PANDAS_VERSION from eland.common

* Still skip the doctest, but make the output pandas 2 instead of 1

* Still skip doctest, but switch to pandas 2 output

* Prepare for pandas 3

* Reference the right column

* Ignore output in tests but switch to pandas 2 output

* Add line comment about NBVAL_IGNORE_OUTPUT

* Restore missing line and add stderr cell

* Use non-private method instead

* Fix indentation and parameter issues

* If index is not specified, and pandas 1 is present, set it to True

From pandas 2 and upwards, index is set to None by default

* Run black

* Newer version of black might have different opinions?

* Add line comment

* Remove unused import

* Add reason for ignore statement

* Add reason for skip

---------

Co-authored-by: Quentin Pradet <[email protected]>
bartbroere and pquentin authored Feb 4, 2025
1 parent 77589b2 commit 75c57b0
Showing 19 changed files with 161 additions and 70 deletions.
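
Most of the changes below branch on the installed pandas major version instead of pinning a single release. A minimal sketch of that gating pattern, assuming `PANDAS_VERSION` is a tuple of release numbers (the `packaging`-based derivation shown here is illustrative; `eland.common` may build the tuple differently):

```python
# Hedged sketch of the version-gating pattern used throughout this commit.
import pandas as pd
from packaging.version import Version

# Assumption: a tuple such as (1, 5, 0) or (2, 2, 3), similar to eland.common.PANDAS_VERSION
PANDAS_VERSION = Version(pd.__version__).release

if PANDAS_VERSION[0] >= 2:
    pass  # pandas 2 code path (new defaults, removed APIs)
else:
    pass  # pandas 1 code path (legacy defaults)
```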
7 changes: 6 additions & 1 deletion .buildkite/pipeline.yml
@@ -29,11 +29,16 @@ steps:
      machineType: "n2-standard-4"
    env:
      PYTHON_VERSION: "{{ matrix.python }}"
-      PANDAS_VERSION: '1.5.0'
+      PANDAS_VERSION: "{{ matrix.pandas }}"
      TEST_SUITE: "xpack"
      ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
    matrix:
      setup:
+        # Python and pandas versions need to be added to the nox configuration too
+        # (in the decorators of the test method in noxfile.py)
+        pandas:
+          - '1.5.0'
+          - '2.2.3'
        python:
          - '3.12'
          - '3.11'
2 changes: 1 addition & 1 deletion docs/sphinx/examples/demo_notebook.ipynb
@@ -24,7 +24,7 @@
    "\n",
    "For this example, you will need:\n",
    "\n",
-    "- Python 3.8 or later\n",
+    "- Python 3.9 or later\n",
    "- An Elastic deployment\n",
    "  - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
    "\n",
2 changes: 1 addition & 1 deletion eland/common.py
@@ -309,7 +309,7 @@ def elasticsearch_date_to_pandas_date(


def ensure_es_client(
-    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch]
+    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
) -> Elasticsearch:
    if isinstance(es_client, tuple):
        es_client = list(es_client)
10 changes: 5 additions & 5 deletions eland/dataframe.py
@@ -34,7 +34,7 @@
from pandas.util._validators import validate_bool_kwarg  # type: ignore

import eland.plotting as gfx
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import BooleanFilter
from eland.groupby import DataFrameGroupBy
from eland.ndframe import NDFrame
@@ -411,9 +411,7 @@ def drop(
            axis = pd.DataFrame._get_axis_name(axis)
            axes = {axis: labels}
        elif index is not None or columns is not None:
-            axes, _ = pd.DataFrame()._construct_axes_from_arguments(
-                (index, columns), {}
-            )
+            axes = {"columns": columns, "index": index}
        else:
            raise ValueError(
                "Need to specify at least one of 'labels', 'index' or 'columns'"
@@ -1361,7 +1359,7 @@ def to_json(
        default_handler=None,
        lines=False,
        compression="infer",
-        index=True,
+        index=None,
        indent=None,
        storage_options=None,
    ):
@@ -1376,6 +1374,8 @@ def to_json(
        --------
        :pandas_api_docs:`pandas.DataFrame.to_json`
        """
+        if index is None and PANDAS_VERSION[0] == 1:
+            index = True  # switch to the pandas 1 default
        kwargs = {
            "path_or_buf": path_or_buf,
            "orient": orient,
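
The private helper `pd.DataFrame()._construct_axes_from_arguments` no longer exists in pandas 2, so `drop()` now builds the axes mapping directly, and `to_json()` maps its new `index=None` default back to `True` when pandas 1 is installed. A standalone sketch of the axes resolution (the function name is illustrative):

```python
# Illustrative sketch of the axes resolution that replaces the removed private
# pandas helper; it mirrors the drop() branch in the diff above.
import pandas as pd

def resolve_drop_axes(labels=None, axis=0, index=None, columns=None):
    if labels is not None:
        axis_name = pd.DataFrame._get_axis_name(axis)  # "index" or "columns"
        return {axis_name: labels}
    if index is not None or columns is not None:
        return {"columns": columns, "index": index}
    raise ValueError("Need to specify at least one of 'labels', 'index' or 'columns'")

print(resolve_drop_axes(columns=["Carrier"]))  # {'columns': ['Carrier'], 'index': None}
```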
39 changes: 26 additions & 13 deletions eland/etl.py
@@ -16,6 +16,7 @@
# under the License.

import csv
+import warnings
from collections import deque
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union

@@ -110,15 +111,15 @@ def pandas_to_eland(
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
-    >>> pd_df.dtypes
-    A           float64
-    B             int64
-    C            object
-    D    datetime64[ns]
-    E           float64
-    F              bool
-    G             int64
-    H            object
+    >>> pd_df.dtypes  # doctest skip required for pandas < 2 # doctest: +SKIP
+    A           float64
+    B             int64
+    C            object
+    D     datetime64[s]
+    E           float64
+    F              bool
+    G             int64
+    H            object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
@@ -307,9 +308,9 @@ def csv_to_eland(  # type: ignore
    names=None,
    index_col=None,
    usecols=None,
-    squeeze=False,
+    squeeze=None,
    prefix=None,
-    mangle_dupe_cols=True,
+    mangle_dupe_cols=None,
    # General Parsing Configuration
    dtype=None,
    engine=None,
@@ -357,6 +358,7 @@ def csv_to_eland(  # type: ignore
    low_memory: bool = _DEFAULT_LOW_MEMORY,
    memory_map=False,
    float_precision=None,
+    **extra_kwargs,
) -> "DataFrame":
    """
    Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
@@ -485,7 +487,6 @@ def csv_to_eland(  # type: ignore
        "usecols": usecols,
        "verbose": verbose,
        "encoding": encoding,
-        "squeeze": squeeze,
        "memory_map": memory_map,
        "float_precision": float_precision,
        "na_filter": na_filter,
@@ -494,9 +495,9 @@ def csv_to_eland(  # type: ignore
        "error_bad_lines": error_bad_lines,
        "on_bad_lines": on_bad_lines,
        "low_memory": low_memory,
-        "mangle_dupe_cols": mangle_dupe_cols,
        "infer_datetime_format": infer_datetime_format,
        "skip_blank_lines": skip_blank_lines,
+        **extra_kwargs,
    }

    if chunksize is None:
@@ -525,6 +526,18 @@ def csv_to_eland(  # type: ignore

    kwargs.pop("on_bad_lines")

+    if "squeeze" in kwargs:
+        kwargs.pop("squeeze")
+        warnings.warn(
+            "This argument no longer works, use .squeeze('columns') on your DataFrame instead"
+        )
+
+    if "mangle_dupe_cols" in kwargs:
+        kwargs.pop("mangle_dupe_cols")
+        warnings.warn(
+            "The mangle_dupe_cols argument no longer works. Furthermore, "
+            "duplicate columns will automatically get a number suffix."
+        )
    # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
    reader = pd.read_csv(filepath_or_buffer, **kwargs)
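
`pandas.read_csv` dropped the `squeeze` and `mangle_dupe_cols` keywords in 2.0, so `csv_to_eland` now strips them from the forwarded kwargs and warns instead of letting the call fail. A self-contained sketch of that shim (the wrapper name is illustrative):

```python
# Hedged sketch of the compatibility shim above: keywords that pandas.read_csv
# removed in 2.0 are popped from the kwargs and reported instead of raising.
import warnings

import pandas as pd

def read_csv_compat(filepath_or_buffer, **kwargs):
    if "squeeze" in kwargs:
        kwargs.pop("squeeze")
        warnings.warn(
            "This argument no longer works, use .squeeze('columns') on your DataFrame instead"
        )
    if "mangle_dupe_cols" in kwargs:
        kwargs.pop("mangle_dupe_cols")
        warnings.warn(
            "The mangle_dupe_cols argument no longer works. Furthermore, "
            "duplicate columns will automatically get a number suffix."
        )
    return pd.read_csv(filepath_or_buffer, **kwargs)
```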
7 changes: 5 additions & 2 deletions eland/field_mappings.py
@@ -712,8 +712,11 @@ def add_scripted_field(
            capabilities, orient="index", columns=FieldMappings.column_labels
        )

-        self._mappings_capabilities = self._mappings_capabilities.append(
-            capability_matrix_row
+        self._mappings_capabilities = pd.concat(
+            [
+                self._mappings_capabilities,
+                capability_matrix_row,
+            ]
        )

    def numeric_source_fields(self) -> List[str]:
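
`DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement used above. A minimal standalone illustration with made-up rows:

```python
# DataFrame.append (removed in pandas 2.0) replaced by pd.concat, as in the diff above.
import pandas as pd

capabilities = pd.DataFrame({"es_dtype": ["keyword"]}, index=["OriginCityName"])  # illustrative
new_row = pd.DataFrame({"es_dtype": ["double"]}, index=["script_field_1"])        # illustrative

# pandas < 2:  capabilities = capabilities.append(new_row)
capabilities = pd.concat([capabilities, new_row])
print(capabilities.index.tolist())  # ['OriginCityName', 'script_field_1']
```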
17 changes: 13 additions & 4 deletions eland/series.py
@@ -40,11 +40,12 @@

import numpy as np
import pandas as pd  # type: ignore
+from pandas.core.indexes.frozen import FrozenList
from pandas.io.common import _expand_user, stringify_path  # type: ignore

import eland.plotting
from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import (
    BooleanFilter,
    Equal,
@@ -292,18 +293,26 @@ def value_counts(self, es_size: int = 10) -> pd.Series:
        Examples
        --------
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
-        >>> df['Carrier'].value_counts()
+        >>> df['Carrier'].value_counts()  # doctest: +SKIP
+        Carrier
        Logstash Airways    3331
        JetBeats            3274
        Kibana Airlines     3234
        ES-Air              3220
-        Name: Carrier, dtype: int64
+        Name: count, dtype: int64
        """
        if not isinstance(es_size, int):
            raise TypeError("es_size must be a positive integer.")
        elif es_size <= 0:
            raise ValueError("es_size must be a positive integer.")
-        return self._query_compiler.value_counts(es_size)
+        value_counts = self._query_compiler.value_counts(es_size)
+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
+        if PANDAS_VERSION[0] == 2:
+            value_counts.name = "count"
+            value_counts.index.names = FrozenList([self.es_field_name])
+            value_counts.index.name = self.es_field_name
+
+        return value_counts

    # dtype not implemented for Series as causes query to fail
    # in pandas.core.computation.ops.Term.type
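
For context, pandas 2 changed what `Series.value_counts` returns: the result is named "count" and the index takes the field name, which is what the version check above reproduces for eland results. A small pandas-only illustration:

```python
# pandas 2 names the value_counts result "count" and the index after the series;
# pandas 1 kept the series name on the result and left the index unnamed.
import pandas as pd

s = pd.Series(["ES-Air", "JetBeats", "ES-Air"], name="Carrier")
counts = s.value_counts()
print(counts.name)        # "count" on pandas 2, "Carrier" on pandas 1
print(counts.index.name)  # "Carrier" on pandas 2, None on pandas 1
```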
2 changes: 1 addition & 1 deletion noxfile.py
@@ -101,7 +101,7 @@ def lint(session):


@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
-@nox.parametrize("pandas_version", ["1.5.0"])
+@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
def test(session, pandas_version: str):
    session.install("-r", "requirements-dev.txt")
    session.install(".")
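
Each Python session now runs once per pandas release in the matrix. A sketch of how the parametrized version might be consumed inside the session; the `pandas~=` pin and the `pytest` invocation are assumptions, since the session body is truncated in the diff above:

```python
# Sketch only: consuming the parametrized pandas_version in a nox session.
import nox

@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
def test(session, pandas_version: str):
    session.install("-r", "requirements-dev.txt")
    session.install(".")
    session.install(f"pandas~={pandas_version}")  # assumption: pin the matrix version
    session.run("pytest")                         # assumption: simplified test invocation
```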
2 changes: 1 addition & 1 deletion setup.py
@@ -87,7 +87,7 @@
    packages=find_packages(include=["eland", "eland.*"]),
    install_requires=[
        "elasticsearch>=8.3,<9",
-        "pandas>=1.5,<2",
+        "pandas>=1.5,<3",
        "matplotlib>=3.6",
        "numpy>=1.2.0,<2",
        "packaging",
8 changes: 6 additions & 2 deletions tests/common.py
@@ -24,6 +24,7 @@
from pandas.testing import assert_frame_equal, assert_series_equal

import eland as ed
+from eland.common import PANDAS_VERSION

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

@@ -45,7 +46,10 @@
_pd_flights = pd.DataFrame.from_records(flight_records).reindex(
    _ed_flights.columns, axis=1
)
-_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
+if PANDAS_VERSION[0] >= 2:
+    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
+else:
+    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
_pd_flights["Cities"] = _pd_flights.apply(
    lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
@@ -62,7 +66,7 @@
)
_pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
-_pd_ecommerce["customer_birth_date"].astype("datetime64")
+_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
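
pandas 2 infers one datetime format from the first value and rejects strings that deviate from it, so the flight timestamps are parsed with `format="mixed"` on pandas 2 only (the argument does not exist in pandas 1). A sketch with made-up timestamp strings:

```python
# format="mixed" (pandas 2+) restores per-element parsing of heterogeneous
# timestamp strings; pandas 1 parses element-wise by default.
import pandas as pd

from eland.common import PANDAS_VERSION

timestamps = pd.Series(["2018-01-01T00:00:00", "2018-01-01 00:06:27"])  # illustrative values
if PANDAS_VERSION[0] >= 2:
    parsed = pd.to_datetime(timestamps, format="mixed")
else:
    parsed = pd.to_datetime(timestamps)
print(parsed.dtype)
```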
21 changes: 15 additions & 6 deletions tests/conftest.py
@@ -77,24 +77,33 @@ def f(*args, **kwargs):
                pd_exc = e

            self.check_exception(ed_exc, pd_exc)
-            self.check_values(ed_obj, pd_obj)
+            try:
+                self.check_values(ed_obj, pd_obj)
+            except AssertionError as e:
+                # This is an attribute we allow to differ when comparing zero-length objects
+                if (
+                    'Attribute "inferred_type" are different' in repr(e)
+                    and len(ed_obj) == 0
+                    and len(pd_obj) == 0
+                ):
+                    self.check_values(ed_obj, pd_obj, check_index_type=False)

            if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
                return SymmetricAPIChecker(ed_obj, pd_obj)
            return pd_obj

        return f

-    def check_values(self, ed_obj, pd_obj):
+    def check_values(self, ed_obj, pd_obj, **kwargs):
        """Checks that any two values coming from eland and pandas are equal"""
        if isinstance(ed_obj, ed.DataFrame):
-            assert_pandas_eland_frame_equal(pd_obj, ed_obj)
+            assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, ed.Series):
-            assert_pandas_eland_series_equal(pd_obj, ed_obj)
+            assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, pd.DataFrame):
-            assert_frame_equal(ed_obj, pd_obj)
+            assert_frame_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Series):
-            assert_series_equal(ed_obj, pd_obj)
+            assert_series_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Index):
            assert ed_obj.equals(pd_obj)
        else:
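
The asserters now take pass-through keyword arguments, so a relaxed re-check such as `check_index_type=False` for zero-length results can be forwarded to the underlying pandas helpers. A reduced sketch of the forwarding itself, using a tolerance as the forwarded option:

```python
# Reduced sketch: check_values forwards **kwargs to the pandas asserters so
# callers can relax individual checks.
import pandas as pd
from pandas.testing import assert_frame_equal

def check_values(ed_obj: pd.DataFrame, pd_obj: pd.DataFrame, **kwargs) -> None:
    assert_frame_equal(ed_obj, pd_obj, **kwargs)

a = pd.DataFrame({"AvgTicketPrice": [100.0, 250.0]})
b = pd.DataFrame({"AvgTicketPrice": [100.0, 250.0000001]})
check_values(a, b, check_exact=False, rtol=1e-3)  # forwarded tolerance makes this pass
```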
2 changes: 2 additions & 0 deletions tests/dataframe/test_datetime_pytest.py
@@ -87,6 +87,8 @@ def test_datetime_to_ms(self):
            },
            index=["0", "1", "2"],
        )
+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
+        df["D"] = df["D"].astype("datetime64[ns]")

        expected_mappings = {
            "mappings": {
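
pandas 2 preserves supported non-nanosecond resolutions of `datetime64` input instead of always converting to nanoseconds, so the test pins column `D` to `datetime64[ns]` before comparing mappings. An isolated illustration:

```python
# The explicit astype normalises the dtype so both pandas majors agree.
import numpy as np
import pandas as pd

values = np.array(["2019-01-01T00:00:00"], dtype="datetime64[s]")
s = pd.Series(values)
print(s.dtype)                  # datetime64[s] on pandas 2, datetime64[ns] on pandas 1
s = s.astype("datetime64[ns]")
print(s.dtype)                  # datetime64[ns] on both
```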
12 changes: 10 additions & 2 deletions tests/dataframe/test_describe_pytest.py
@@ -33,9 +33,17 @@ def test_flights_describe(self):
            ["Cancelled", "FlightDelay"], axis="columns"
        )

+        # Pandas >= 2 calculates aggregations such as min and max for timestamps too
+        # This could be implemented in eland, but as of yet this is not the case
+        # We therefore remove it before the comparison
+        if "timestamp" in pd_describe.columns:
+            pd_describe = pd_describe.drop(["timestamp"], axis="columns")
+
+        # Pandas >= 2 orders the aggregations differently than Pandas < 2
+        # A sort_index is applied so tests will succeed in both environments
        assert_frame_equal(
-            pd_describe.drop(["25%", "50%", "75%"], axis="index"),
-            ed_describe.drop(["25%", "50%", "75%"], axis="index"),
+            pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
+            ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
            check_exact=False,
            rtol=True,
        )
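
Because pandas 2 orders `describe()` rows differently and also describes datetime columns, the comparison above drops the timestamp column and sorts the index first. The sorting trick in isolation, with made-up numbers:

```python
# Sorting the index makes the comparison independent of the aggregation-row
# order, which differs between pandas 1 and 2.
import pandas as pd
from pandas.testing import assert_frame_equal

left = pd.DataFrame({"AvgTicketPrice": [100.0, 980.0]}, index=["min", "max"])
right = pd.DataFrame({"AvgTicketPrice": [980.0, 100.0]}, index=["max", "min"])
assert_frame_equal(left.sort_index(), right.sort_index())  # passes regardless of row order
```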
2 changes: 1 addition & 1 deletion tests/dataframe/test_head_tail_pytest.py
@@ -99,7 +99,7 @@ def test_head_0(self):

        ed_head_0 = ed_flights.head(0)
        pd_head_0 = pd_flights.head(0)
-        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
+        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)

    def test_doc_test_tail(self):
        df = self.ed_flights()
16 changes: 14 additions & 2 deletions tests/dataframe/test_metrics_pytest.py
@@ -22,6 +22,7 @@
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal

+from eland.common import PANDAS_VERSION
from tests.common import TestData, assert_almost_equal


@@ -74,6 +75,8 @@ def test_flights_extended_metrics(self):
        logger.setLevel(logging.DEBUG)

        for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
            pd_metric = getattr(pd_flights, func)(
                **({"numeric_only": True} if func != "mad" else {})
            )
@@ -92,6 +95,8 @@ def test_flights_extended_metrics_nan(self):
        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]

        for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
            pd_metric = getattr(pd_flights_1, func)()
            ed_metric = getattr(ed_flights_1, func)(numeric_only=False)

@@ -102,6 +107,8 @@ def test_flights_extended_metrics_nan(self):
        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]

        for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
            pd_metric = getattr(pd_flights_0, func)()
            ed_metric = getattr(ed_flights_0, func)(numeric_only=False)

@@ -491,8 +498,13 @@ def test_flights_agg_quantile(self, numeric_only):
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        )

-        pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
-        ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+        if PANDAS_VERSION[0] == 1:
+            pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+            ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+
+        else:  # numeric_only is no longer available for pandas > 2
+            pd_quantile = pd_flights.agg(["quantile", "min"])
+            ed_quantile = ed_flights.agg(["quantile", "min"])

        assert_frame_equal(
            pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
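
Two pandas 2 removals are handled in this file: `mad()` no longer exists, and this `agg` call no longer takes `numeric_only`. A compact sketch of both guards with a made-up frame:

```python
# Skip mad() on pandas 2 (removed) and only pass numeric_only to agg on pandas 1,
# mirroring the test changes above.
import pandas as pd

from eland.common import PANDAS_VERSION

df = pd.DataFrame({"AvgTicketPrice": [100.0, 250.0], "FlightDelayMin": [0, 15]})

for func in ["mean", "std", "mad"]:
    if PANDAS_VERSION[0] >= 2 and func == "mad":
        continue  # Series/DataFrame.mad was removed in pandas 2.0
    kwargs = {"numeric_only": True} if func != "mad" else {}
    print(func, getattr(df, func)(**kwargs).to_dict())

if PANDAS_VERSION[0] == 1:
    agg = df.agg(["quantile", "min"], numeric_only=True)
else:
    agg = df.agg(["quantile", "min"])  # numeric_only not passed on pandas 2, as in the test
print(agg)
```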