Support Pandas 2 (#742)
* Fix test setup to match pandas 2.0 demands

* Use the now deprecated _append method

(Better solution might exist)

* Deal with numeric_only being removed in metrics test

* Skip mad metric for other pandas versions

* Account for differences between pandas versions in describe methods

* Run black

* Check Pandas version first

* Mirror behaviour of installed Pandas version when running value_counts

* Allow passing arguments to the individual asserters

* Fix for method _construct_axes_from_arguments no longer existing

* Skip mad metric if it does not exist

* Account for pandas 2.0 timestamp default behaviour

* Deal with empty vs other inferred data types

* Account for default datetime precision change

* Run Black

* Solution for differences in inferred_type only

* Fix csv and json issues

* Skip two doctests

* Passing a set as indexer is no longer allowed

* Don't validate output where it differs between Pandas versions in the environment

* Update test matrix and packaging metadata

* Update version of Python in the docs

* Update Python version in demo notebook

* Match noxfile

* Symmetry

* Fix trailing comma in JSON

* Revert some changes in setup.py to fix building the documentation

* Revert "Revert some changes in setup.py to fix building the documentation"

This reverts commit ea98797.

* Use PANDAS_VERSION from eland.common

* Still skip the doctest, but make the output pandas 2 instead of 1

* Still skip doctest, but switch to pandas 2 output

* Prepare for pandas 3

* Reference the right column

* Ignore output in tests but switch to pandas 2 output

* Add line comment about NBVAL_IGNORE_OUTPUT

* Restore missing line and add stderr cell

* Use non-private method instead

* Fix indentation and parameter issues

* If index is not specified, and pandas 1 is present, set it to True

From pandas 2 and upwards, index is set to None by default

* Run black

* Newer version of black might have different opinions?

* Add line comment

* Remove unused import

* Add reason for ignore statement

* Add reason for skip

---------

Co-authored-by: Quentin Pradet <[email protected]>
bartbroere and pquentin authored Feb 4, 2025
1 parent 77589b2 commit 75c57b0
Showing 19 changed files with 161 additions and 70 deletions.
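
Most of the changes below branch on the installed pandas major version instead of pinning a single release. A minimal sketch of that gating pattern, assuming `PANDAS_VERSION` is a tuple of release numbers (the `packaging`-based derivation shown here is illustrative; `eland.common` may build the tuple differently):

```python
# Hedged sketch of the version-gating pattern used throughout this commit.
import pandas as pd
from packaging.version import Version

# Assumption: a tuple such as (1, 5, 0) or (2, 2, 3), similar to eland.common.PANDAS_VERSION
PANDAS_VERSION = Version(pd.__version__).release

if PANDAS_VERSION[0] >= 2:
    pass  # pandas 2 code path (new defaults, removed APIs)
else:
    pass  # pandas 1 code path (legacy defaults)
```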
7 changes: 6 additions & 1 deletion .buildkite/pipeline.yml
@@ -29,11 +29,16 @@ steps:
      machineType: "n2-standard-4"
    env:
      PYTHON_VERSION: "{{ matrix.python }}"
-      PANDAS_VERSION: '1.5.0'
+      PANDAS_VERSION: "{{ matrix.pandas }}"
      TEST_SUITE: "xpack"
      ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
    matrix:
      setup:
+        # Python and pandas versions need to be added to the nox configuration too
+        # (in the decorators of the test method in noxfile.py)
+        pandas:
+          - '1.5.0'
+          - '2.2.3'
        python:
          - '3.12'
          - '3.11'
2 changes: 1 addition & 1 deletion docs/sphinx/examples/demo_notebook.ipynb
@@ -24,7 +24,7 @@
    "\n",
    "For this example, you will need:\n",
    "\n",
-    "- Python 3.8 or later\n",
+    "- Python 3.9 or later\n",
    "- An Elastic deployment\n",
    "  - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
    "\n",
2 changes: 1 addition & 1 deletion eland/common.py
@@ -309,7 +309,7 @@ def elasticsearch_date_to_pandas_date(


def ensure_es_client(
-    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch]
+    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
) -> Elasticsearch:
    if isinstance(es_client, tuple):
        es_client = list(es_client)
10 changes: 5 additions & 5 deletions eland/dataframe.py
@@ -34,7 +34,7 @@
from pandas.util._validators import validate_bool_kwarg  # type: ignore

import eland.plotting as gfx
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import BooleanFilter
from eland.groupby import DataFrameGroupBy
from eland.ndframe import NDFrame
@@ -411,9 +411,7 @@ def drop(
            axis = pd.DataFrame._get_axis_name(axis)
            axes = {axis: labels}
        elif index is not None or columns is not None:
-            axes, _ = pd.DataFrame()._construct_axes_from_arguments(
-                (index, columns), {}
-            )
+            axes = {"columns": columns, "index": index}
        else:
            raise ValueError(
                "Need to specify at least one of 'labels', 'index' or 'columns'"
@@ -1361,7 +1359,7 @@ def to_json(
        default_handler=None,
        lines=False,
        compression="infer",
-        index=True,
+        index=None,
        indent=None,
        storage_options=None,
    ):
@@ -1376,6 +1374,8 @@ def to_json(
        --------
        :pandas_api_docs:`pandas.DataFrame.to_json`
        """
+        if index is None and PANDAS_VERSION[0] == 1:
+            index = True  # switch to the pandas 1 default
        kwargs = {
            "path_or_buf": path_or_buf,
            "orient": orient,
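
The private helper `pd.DataFrame()._construct_axes_from_arguments` no longer exists in pandas 2, so `drop()` now builds the axes mapping directly, and `to_json()` maps its new `index=None` default back to `True` when pandas 1 is installed. A standalone sketch of the axes resolution (the function name is illustrative):

```python
# Illustrative sketch of the axes resolution that replaces the removed private
# pandas helper; it mirrors the drop() branch in the diff above.
import pandas as pd

def resolve_drop_axes(labels=None, axis=0, index=None, columns=None):
    if labels is not None:
        axis_name = pd.DataFrame._get_axis_name(axis)  # "index" or "columns"
        return {axis_name: labels}
    if index is not None or columns is not None:
        return {"columns": columns, "index": index}
    raise ValueError("Need to specify at least one of 'labels', 'index' or 'columns'")

print(resolve_drop_axes(columns=["Carrier"]))  # {'columns': ['Carrier'], 'index': None}
```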
39 changes: 26 additions & 13 deletions eland/etl.py
@@ -16,6 +16,7 @@
# under the License.

import csv
+import warnings
from collections import deque
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union

@@ -110,15 +111,15 @@ def pandas_to_eland(
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
-    >>> pd_df.dtypes
-    A           float64
-    B             int64
-    C            object
-    D    datetime64[ns]
-    E           float64
-    F              bool
-    G             int64
-    H            object
+    >>> pd_df.dtypes  # doctest skip required for pandas < 2 # doctest: +SKIP
+    A           float64
+    B             int64
+    C            object
+    D     datetime64[s]
+    E           float64
+    F              bool
+    G             int64
+    H            object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
@@ -307,9 +308,9 @@ def csv_to_eland(  # type: ignore
    names=None,
    index_col=None,
    usecols=None,
-    squeeze=False,
+    squeeze=None,
    prefix=None,
-    mangle_dupe_cols=True,
+    mangle_dupe_cols=None,
    # General Parsing Configuration
    dtype=None,
    engine=None,
@@ -357,6 +358,7 @@ def csv_to_eland(  # type: ignore
    low_memory: bool = _DEFAULT_LOW_MEMORY,
    memory_map=False,
    float_precision=None,
+    **extra_kwargs,
) -> "DataFrame":
    """
    Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
@@ -485,7 +487,6 @@ def csv_to_eland(  # type: ignore
        "usecols": usecols,
        "verbose": verbose,
        "encoding": encoding,
-        "squeeze": squeeze,
        "memory_map": memory_map,
        "float_precision": float_precision,
        "na_filter": na_filter,
@@ -494,9 +495,9 @@ def csv_to_eland(  # type: ignore
        "error_bad_lines": error_bad_lines,
        "on_bad_lines": on_bad_lines,
        "low_memory": low_memory,
-        "mangle_dupe_cols": mangle_dupe_cols,
        "infer_datetime_format": infer_datetime_format,
        "skip_blank_lines": skip_blank_lines,
+        **extra_kwargs,
    }

    if chunksize is None:
@@ -525,6 +526,18 @@ def csv_to_eland(  # type: ignore

    kwargs.pop("on_bad_lines")

+    if "squeeze" in kwargs:
+        kwargs.pop("squeeze")
+        warnings.warn(
+            "This argument no longer works, use .squeeze('columns') on your DataFrame instead"
+        )
+
+    if "mangle_dupe_cols" in kwargs:
+        kwargs.pop("mangle_dupe_cols")
+        warnings.warn(
+            "The mangle_dupe_cols argument no longer works. Furthermore, "
+            "duplicate columns will automatically get a number suffix."
+        )
    # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
    reader = pd.read_csv(filepath_or_buffer, **kwargs)
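
`pandas.read_csv` dropped the `squeeze` and `mangle_dupe_cols` keywords in 2.0, so `csv_to_eland` now strips them from the forwarded kwargs and warns instead of letting the call fail. A self-contained sketch of that shim (the wrapper name is illustrative):

```python
# Hedged sketch of the compatibility shim above: keywords that pandas.read_csv
# removed in 2.0 are popped from the kwargs and reported instead of raising.
import warnings

import pandas as pd

def read_csv_compat(filepath_or_buffer, **kwargs):
    if "squeeze" in kwargs:
        kwargs.pop("squeeze")
        warnings.warn(
            "This argument no longer works, use .squeeze('columns') on your DataFrame instead"
        )
    if "mangle_dupe_cols" in kwargs:
        kwargs.pop("mangle_dupe_cols")
        warnings.warn(
            "The mangle_dupe_cols argument no longer works. Furthermore, "
            "duplicate columns will automatically get a number suffix."
        )
    return pd.read_csv(filepath_or_buffer, **kwargs)
```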
7 changes: 5 additions & 2 deletions eland/field_mappings.py
@@ -712,8 +712,11 @@ def add_scripted_field(
            capabilities, orient="index", columns=FieldMappings.column_labels
        )

-        self._mappings_capabilities = self._mappings_capabilities.append(
-            capability_matrix_row
+        self._mappings_capabilities = pd.concat(
+            [
+                self._mappings_capabilities,
+                capability_matrix_row,
+            ]
        )

    def numeric_source_fields(self) -> List[str]:
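
`DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement used above. A minimal standalone illustration with made-up rows:

```python
# DataFrame.append (removed in pandas 2.0) replaced by pd.concat, as in the diff above.
import pandas as pd

capabilities = pd.DataFrame({"es_dtype": ["keyword"]}, index=["OriginCityName"])  # illustrative
new_row = pd.DataFrame({"es_dtype": ["double"]}, index=["script_field_1"])        # illustrative

# pandas < 2:  capabilities = capabilities.append(new_row)
capabilities = pd.concat([capabilities, new_row])
print(capabilities.index.tolist())  # ['OriginCityName', 'script_field_1']
```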
17 changes: 13 additions & 4 deletions eland/series.py
@@ -40,11 +40,12 @@

import numpy as np
import pandas as pd  # type: ignore
+from pandas.core.indexes.frozen import FrozenList
from pandas.io.common import _expand_user, stringify_path  # type: ignore

import eland.plotting
from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import (
    BooleanFilter,
    Equal,
@@ -292,18 +293,26 @@ def value_counts(self, es_size: int = 10) -> pd.Series:
        Examples
        --------
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
-        >>> df['Carrier'].value_counts()
+        >>> df['Carrier'].value_counts()  # doctest: +SKIP
+        Carrier
        Logstash Airways    3331
        JetBeats            3274
        Kibana Airlines     3234
        ES-Air              3220
-        Name: Carrier, dtype: int64
+        Name: count, dtype: int64
        """
        if not isinstance(es_size, int):
            raise TypeError("es_size must be a positive integer.")
        elif es_size <= 0:
            raise ValueError("es_size must be a positive integer.")
-        return self._query_compiler.value_counts(es_size)
+        value_counts = self._query_compiler.value_counts(es_size)
+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
+        if PANDAS_VERSION[0] == 2:
+            value_counts.name = "count"
+            value_counts.index.names = FrozenList([self.es_field_name])
+            value_counts.index.name = self.es_field_name
+
+        return value_counts

    # dtype not implemented for Series as causes query to fail
    # in pandas.core.computation.ops.Term.type
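
For context, pandas 2 changed what `Series.value_counts` returns: the result is named "count" and the index takes the field name, which is what the version check above reproduces for eland results. A small pandas-only illustration:

```python
# pandas 2 names the value_counts result "count" and the index after the series;
# pandas 1 kept the series name on the result and left the index unnamed.
import pandas as pd

s = pd.Series(["ES-Air", "JetBeats", "ES-Air"], name="Carrier")
counts = s.value_counts()
print(counts.name)        # "count" on pandas 2, "Carrier" on pandas 1
print(counts.index.name)  # "Carrier" on pandas 2, None on pandas 1
```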
2 changes: 1 addition & 1 deletion noxfile.py
@@ -101,7 +101,7 @@ def lint(session):


@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
-@nox.parametrize("pandas_version", ["1.5.0"])
+@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
def test(session, pandas_version: str):
    session.install("-r", "requirements-dev.txt")
    session.install(".")
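
Each Python session now runs once per pandas release in the matrix. A sketch of how the parametrized version might be consumed inside the session; the `pandas~=` pin and the `pytest` invocation are assumptions, since the session body is truncated in the diff above:

```python
# Sketch only: consuming the parametrized pandas_version in a nox session.
import nox

@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
def test(session, pandas_version: str):
    session.install("-r", "requirements-dev.txt")
    session.install(".")
    session.install(f"pandas~={pandas_version}")  # assumption: pin the matrix version
    session.run("pytest")                         # assumption: simplified test invocation
```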
2 changes: 1 addition & 1 deletion setup.py
@@ -87,7 +87,7 @@
    packages=find_packages(include=["eland", "eland.*"]),
    install_requires=[
        "elasticsearch>=8.3,<9",
-        "pandas>=1.5,<2",
+        "pandas>=1.5,<3",
        "matplotlib>=3.6",
        "numpy>=1.2.0,<2",
        "packaging",
8 changes: 6 additions & 2 deletions tests/common.py
@@ -24,6 +24,7 @@
from pandas.testing import assert_frame_equal, assert_series_equal

import eland as ed
+from eland.common import PANDAS_VERSION

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

@@ -45,7 +46,10 @@
_pd_flights = pd.DataFrame.from_records(flight_records).reindex(
    _ed_flights.columns, axis=1
)
-_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
+if PANDAS_VERSION[0] >= 2:
+    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
+else:
+    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
_pd_flights["Cities"] = _pd_flights.apply(
    lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
@@ -62,7 +66,7 @@
)
_pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
-_pd_ecommerce["customer_birth_date"].astype("datetime64")
+_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
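
pandas 2 infers one datetime format from the first value and rejects strings that deviate from it, so the flight timestamps are parsed with `format="mixed"` on pandas 2 only (the argument does not exist in pandas 1). A sketch with made-up timestamp strings:

```python
# format="mixed" (pandas 2+) restores per-element parsing of heterogeneous
# timestamp strings; pandas 1 parses element-wise by default.
import pandas as pd

from eland.common import PANDAS_VERSION

timestamps = pd.Series(["2018-01-01T00:00:00", "2018-01-01 00:06:27"])  # illustrative values
if PANDAS_VERSION[0] >= 2:
    parsed = pd.to_datetime(timestamps, format="mixed")
else:
    parsed = pd.to_datetime(timestamps)
print(parsed.dtype)
```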
21 changes: 15 additions & 6 deletions tests/conftest.py
@@ -77,24 +77,33 @@ def f(*args, **kwargs):
                pd_exc = e

            self.check_exception(ed_exc, pd_exc)
-            self.check_values(ed_obj, pd_obj)
+            try:
+                self.check_values(ed_obj, pd_obj)
+            except AssertionError as e:
+                # This is an attribute we allow to differ when comparing zero-length objects
+                if (
+                    'Attribute "inferred_type" are different' in repr(e)
+                    and len(ed_obj) == 0
+                    and len(pd_obj) == 0
+                ):
+                    self.check_values(ed_obj, pd_obj, check_index_type=False)

            if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
                return SymmetricAPIChecker(ed_obj, pd_obj)
            return pd_obj

        return f

-    def check_values(self, ed_obj, pd_obj):
+    def check_values(self, ed_obj, pd_obj, **kwargs):
        """Checks that any two values coming from eland and pandas are equal"""
        if isinstance(ed_obj, ed.DataFrame):
-            assert_pandas_eland_frame_equal(pd_obj, ed_obj)
+            assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, ed.Series):
-            assert_pandas_eland_series_equal(pd_obj, ed_obj)
+            assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, pd.DataFrame):
-            assert_frame_equal(ed_obj, pd_obj)
+            assert_frame_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Series):
-            assert_series_equal(ed_obj, pd_obj)
+            assert_series_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Index):
            assert ed_obj.equals(pd_obj)
        else:
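
The asserters now take pass-through keyword arguments, so a relaxed re-check such as `check_index_type=False` for zero-length results can be forwarded to the underlying pandas helpers. A reduced sketch of the forwarding itself, using a tolerance as the forwarded option:

```python
# Reduced sketch: check_values forwards **kwargs to the pandas asserters so
# callers can relax individual checks.
import pandas as pd
from pandas.testing import assert_frame_equal

def check_values(ed_obj: pd.DataFrame, pd_obj: pd.DataFrame, **kwargs) -> None:
    assert_frame_equal(ed_obj, pd_obj, **kwargs)

a = pd.DataFrame({"AvgTicketPrice": [100.0, 250.0]})
b = pd.DataFrame({"AvgTicketPrice": [100.0, 250.0000001]})
check_values(a, b, check_exact=False, rtol=1e-3)  # forwarded tolerance makes this pass
```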
2 changes: 2 additions & 0 deletions tests/dataframe/test_datetime_pytest.py
@@ -87,6 +87,8 @@ def test_datetime_to_ms(self):
            },
            index=["0", "1", "2"],
        )
+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
+        df["D"] = df["D"].astype("datetime64[ns]")

        expected_mappings = {
            "mappings": {
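
pandas 2 preserves supported non-nanosecond resolutions of `datetime64` input instead of always converting to nanoseconds, so the test pins column `D` to `datetime64[ns]` before comparing mappings. An isolated illustration:

```python
# The explicit astype normalises the dtype so both pandas majors agree.
import numpy as np
import pandas as pd

values = np.array(["2019-01-01T00:00:00"], dtype="datetime64[s]")
s = pd.Series(values)
print(s.dtype)                  # datetime64[s] on pandas 2, datetime64[ns] on pandas 1
s = s.astype("datetime64[ns]")
print(s.dtype)                  # datetime64[ns] on both
```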
12 changes: 10 additions & 2 deletions tests/dataframe/test_describe_pytest.py
@@ -33,9 +33,17 @@ def test_flights_describe(self):
            ["Cancelled", "FlightDelay"], axis="columns"
        )

+        # Pandas >= 2 calculates aggregations such as min and max for timestamps too
+        # This could be implemented in eland, but as of yet this is not the case
+        # We therefore remove it before the comparison
+        if "timestamp" in pd_describe.columns:
+            pd_describe = pd_describe.drop(["timestamp"], axis="columns")
+
+        # Pandas >= 2 orders the aggregations differently than Pandas < 2
+        # A sort_index is applied so tests will succeed in both environments
        assert_frame_equal(
-            pd_describe.drop(["25%", "50%", "75%"], axis="index"),
-            ed_describe.drop(["25%", "50%", "75%"], axis="index"),
+            pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
+            ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
            check_exact=False,
            rtol=True,
        )
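
Because pandas 2 orders `describe()` rows differently and also describes datetime columns, the comparison above drops the timestamp column and sorts the index first. The sorting trick in isolation, with made-up numbers:

```python
# Sorting the index makes the comparison independent of the aggregation-row
# order, which differs between pandas 1 and 2.
import pandas as pd
from pandas.testing import assert_frame_equal

left = pd.DataFrame({"AvgTicketPrice": [100.0, 980.0]}, index=["min", "max"])
right = pd.DataFrame({"AvgTicketPrice": [980.0, 100.0]}, index=["max", "min"])
assert_frame_equal(left.sort_index(), right.sort_index())  # passes regardless of row order
```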
2 changes: 1 addition & 1 deletion tests/dataframe/test_head_tail_pytest.py
@@ -99,7 +99,7 @@ def test_head_0(self):

        ed_head_0 = ed_flights.head(0)
        pd_head_0 = pd_flights.head(0)
-        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
+        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)

    def test_doc_test_tail(self):
        df = self.ed_flights()
16 changes: 14 additions & 2 deletions tests/dataframe/test_metrics_pytest.py
@@ -22,6 +22,7 @@
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal

+from eland.common import PANDAS_VERSION
from tests.common import TestData, assert_almost_equal


@@ -74,6 +75,8 @@ def test_flights_extended_metrics(self):
        logger.setLevel(logging.DEBUG)

        for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
            pd_metric = getattr(pd_flights, func)(
                **({"numeric_only": True} if func != "mad" else {})
            )
@@ -92,6 +95,8 @@ def test_flights_extended_metrics_nan(self):
        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]

        for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
            pd_metric = getattr(pd_flights_1, func)()
            ed_metric = getattr(ed_flights_1, func)(numeric_only=False)

@@ -102,6 +107,8 @@ def test_flights_extended_metrics_nan(self):
        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]

        for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
            pd_metric = getattr(pd_flights_0, func)()
            ed_metric = getattr(ed_flights_0, func)(numeric_only=False)

@@ -491,8 +498,13 @@ def test_flights_agg_quantile(self, numeric_only):
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        )

-        pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
-        ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+        if PANDAS_VERSION[0] == 1:
+            pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+            ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+
+        else:  # numeric_only is no longer available for pandas > 2
+            pd_quantile = pd_flights.agg(["quantile", "min"])
+            ed_quantile = ed_flights.agg(["quantile", "min"])

        assert_frame_equal(
            pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
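
Two pandas 2 removals are handled in this file: `mad()` no longer exists, and this `agg` call no longer takes `numeric_only`. A compact sketch of both guards with a made-up frame:

```python
# Skip mad() on pandas 2 (removed) and only pass numeric_only to agg on pandas 1,
# mirroring the test changes above.
import pandas as pd

from eland.common import PANDAS_VERSION

df = pd.DataFrame({"AvgTicketPrice": [100.0, 250.0], "FlightDelayMin": [0, 15]})

for func in ["mean", "std", "mad"]:
    if PANDAS_VERSION[0] >= 2 and func == "mad":
        continue  # Series/DataFrame.mad was removed in pandas 2.0
    kwargs = {"numeric_only": True} if func != "mad" else {}
    print(func, getattr(df, func)(**kwargs).to_dict())

if PANDAS_VERSION[0] == 1:
    agg = df.agg(["quantile", "min"], numeric_only=True)
else:
    agg = df.agg(["quantile", "min"])  # numeric_only not passed on pandas 2, as in the test
print(agg)
```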