Skip to content

Commit

Permalink
Return dataframe with columns properly cast (#104)
Browse files Browse the repository at this point in the history
* feat: add dateparser

* refactor: more robust date detection

* feat: add datetime to fields

* feat: create bool casting function

* feat: add datetime to init

* feat: add more date labels

* refactor: consistency renaming

* feat: add df casting process

* fix: better label test

* feat: add cast test

* refactor: cast column as int

* feat: tests

* fix: use optional instead of pipe

* docs: update changelog

* docs: hint type

Co-authored-by: Adrien Carpentier <[email protected]>

* docs: hint type

Co-authored-by: Adrien Carpentier <[email protected]>

* docs: hint type

Co-authored-by: Adrien Carpentier <[email protected]>

* docs: hint type

Co-authored-by: Adrien Carpentier <[email protected]>

* fix: add missing import

* fix: cast specific datetime formats to datetime

* feat: add option to not cast json columns

---------

Co-authored-by: Adrien Carpentier <[email protected]>
  • Loading branch information
Pierlou and bolinocroustibat authored Dec 16, 2024
1 parent abe0b14 commit 0bc8d8b
Show file tree
Hide file tree
Showing 12 changed files with 187 additions and 61 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
- Enable outputing loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
- Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
- The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)

## 0.7.4 (2024-11-15)

Expand Down
2 changes: 1 addition & 1 deletion csv_detective/detect_fields/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@
)

from .FR.temp import jour_de_la_semaine, mois_de_annee
from .temp import year, date, datetime_iso, datetime_rfc822
from .temp import year, date, datetime, datetime_iso, datetime_rfc822
38 changes: 22 additions & 16 deletions csv_detective/detect_fields/other/booleen/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
PROPORTION = 1
liste_bool = {
'0',
'1',
'vrai',
'faux',
'true',
'false',
'oui',
'non',
'yes',
'no',
'y',
'n',
'o'
bool_mapping = {
"1": True,
"0": False,
"vrai": True,
"faux": False,
"true": True,
"false": False,
"oui": True,
"non": False,
"yes": True,
"no": False,
"y": True,
"n": False,
"o": True,
}

liste_bool = set(bool_mapping.keys())

def _is(val):
'''Détection les booléens'''

def bool_casting(val: str) -> bool:
return bool_mapping.get(val)


def _is(val: str) -> bool:
'''Détecte les booléens'''
return isinstance(val, str) and val.lower() in liste_bool
4 changes: 2 additions & 2 deletions csv_detective/detect_fields/other/float/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
PROPORTION = 1


def float_casting(str2cast):
return float(str2cast.replace(',', '.'))
def float_casting(val: str) -> float:
    """Cast a string to float, accepting a comma as the decimal separator."""
    normalized = val.replace(",", ".")
    return float(normalized)


def _is(val):
Expand Down
58 changes: 21 additions & 37 deletions csv_detective/detect_fields/temp/date/__init__.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,30 @@
import re
from dateutil.parser import parse, ParserError
from csv_detective.detect_fields.other.float import _is as is_float
from unidecode import unidecode
from datetime import datetime
from typing import Optional

from dateparser import parse as date_parser
from dateutil.parser import parse as dateutil_parser, ParserError

PROPORTION = 1
# /!\ this is only for dates, not datetimes which are handled by other utils


def date_casting(val: str) -> Optional[datetime]:
    """Parse a string into a datetime, or return None on failure.

    For performance reasons, we try first with dateutil and fall back on
    the slower but more permissive dateparser.
    """
    try:
        return dateutil_parser(val)
    except (ParserError, ValueError, OverflowError):
        # dateutil can also raise ValueError/OverflowError on exotic input
        # (the pre-refactor code caught these too); fall back to dateparser,
        # which returns None instead of raising when it cannot parse
        return date_parser(val)


def _is(val):
    '''Returns True if val could be a date, False otherwise'''
    # early stops, to cut processing time:
    # plausible date strings are bounded in length (e.g. "DD septembre YYYY")
    if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
        return False
    # a date string is expected to contain a significant share of digits
    threshold = 0.3
    if sum(char.isdigit() for char in val) / len(val) < threshold:
        return False
    res = date_casting(val)
    # reject values with a time component: those are datetimes, not dates
    if not res or res.hour or res.minute or res.second:
        return False
    return True
19 changes: 19 additions & 0 deletions csv_detective/detect_fields/temp/datetime/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Any, Optional

from csv_detective.detect_fields.temp.date import date_casting

PROPORTION = 1


def _is(val: Optional[Any]) -> bool:
    '''Returns True if val could be a datetime, False otherwise'''
    # early stops, to cut processing time:
    # plausible datetime strings fall in a bounded length window
    if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
        return False
    # a datetime string is expected to be mostly digits
    threshold = 0.7
    if sum(char.isdigit() for char in val) / len(val) < threshold:
        return False
    res = date_casting(val)
    # only values with a time component qualify; plain dates are handled
    # by the date detection module
    if res and (res.hour or res.minute or res.second):
        return True
    return False
4 changes: 3 additions & 1 deletion csv_detective/detect_labels/temp/date/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def _is(header):
'dateouv',
'date der maj',
'dmaj',
'jour'
'jour',
'yyyymmdd',
'aaaammjj',
]
processed_header = _process_text(header)

Expand Down
19 changes: 17 additions & 2 deletions csv_detective/explore_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@
from csv_detective import detect_fields, detect_labels
from csv_detective.s3_utils import download_from_minio, upload_to_minio
from csv_detective.schema_generation import generate_table_schema
from csv_detective.utils import test_col, test_label, prepare_output_dict, display_logs_depending_process_time
from csv_detective.utils import (
cast_df,
display_logs_depending_process_time,
prepare_output_dict,
test_col,
test_label,
)
from .detection import (
detect_engine,
detect_separator,
Expand Down Expand Up @@ -111,6 +117,7 @@ def routine(
output_profile: bool = False,
output_schema: bool = False,
output_df: bool = False,
cast_json: bool = True,
verbose: bool = False,
sheet_name: Union[str, int] = None,
) -> Union[dict, tuple[dict, pd.DataFrame]]:
Expand All @@ -127,6 +134,7 @@ def routine(
output_profile: whether or not to add the 'profile' field to the output
output_schema: whether or not to add the 'schema' field to the output (tableschema)
output_df: whether or not to return the loaded DataFrame along with the analysis report
cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
verbose: whether or not to print process logs in console
sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
skipna: whether to keep NaN (empty cells) for tests
Expand Down Expand Up @@ -276,6 +284,8 @@ def routine(
"json": "json",
"json_geojson": "json",
"datetime": "datetime",
"datetime_iso": "datetime",
"datetime_rfc822": "datetime",
"date": "date",
"latitude": "float",
"latitude_l93": "float",
Expand Down Expand Up @@ -352,7 +362,12 @@ def routine(
time() - start_routine
)
if output_df:
return analysis, table
return analysis, cast_df(
df=table,
columns=analysis["columns"],
cast_json=cast_json,
verbose=verbose,
)
return analysis


Expand Down
55 changes: 53 additions & 2 deletions csv_detective/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from typing import Callable
from typing import Callable, Optional, Union
import json
import pandas as pd
import logging
from time import time
from datetime import date, datetime

from csv_detective.detect_fields.other.booleen import bool_casting
from csv_detective.detect_fields.other.float import float_casting
from csv_detective.detect_fields.temp.date import date_casting

logging.basicConfig(level=logging.INFO)

Expand Down Expand Up @@ -210,7 +216,52 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):

def full_word_strictly_inside_string(word: str, string: str):
    """Return True if `word` equals `string` or appears in it as a whole,
    space-delimited word (never as a substring of a longer word)."""
    return (
        word == string
        or (" " + word + " " in string)
        or (string.startswith(word + " "))
        or (string.endswith(" " + word))
    )


def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
    """Cast a raw string cell to the Python type detected for its column.

    Empty cells and non-string values are treated as missing.
    Raises ValueError for an unrecognized `_type` label.
    """
    if not isinstance(value, str) or not value:
        # None is the current default value in hydra, should we keep this?
        return None
    if _type == "float":
        return float_casting(value)
    if _type == "bool":
        return bool_casting(value)
    if _type == "json":
        # in hydra json are given to postgres as strings, conversion is done by postgres
        return json.loads(value)
    if _type in ("date", "datetime"):
        parsed = date_casting(value)
        if not parsed:
            return None
        # dates are truncated to their date part; datetimes kept as-is
        return parsed.date() if _type == "date" else parsed
    raise ValueError(f"Unknown type `{_type}`")


def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
    """Return a new DataFrame with each column cast to its detected python type.

    Args:
        df: the loaded DataFrame (cells are raw strings). NOTE: it is consumed
            column by column (columns are deleted as they are cast) to save RAM.
        columns: analysis output mapping column name to a dict that contains
            at least the "python_type" key.
        cast_json: if False, json columns are kept as raw strings instead of
            being loaded into Python objects.
        verbose: whether to log the processing time.
    """
    if verbose:
        start = time()
    output_df = pd.DataFrame()
    for col_name, detection in columns.items():
        detected_type = detection["python_type"]
        if detected_type == "string" or (detected_type == "json" and not cast_json):
            # no change if detected type is string
            output_df[col_name] = df[col_name].copy()
        elif detected_type == "int":
            # nullable integer dtype, to allow having ints and NaN in the same column
            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
        else:
            # cast receives one cell value at a time
            output_df[col_name] = df[col_name].apply(
                lambda value: cast(value, _type=detected_type)
            )
        # to save RAM
        del df[col_name]
    if verbose:
        display_logs_depending_process_time(
            f'Casting columns completed in {round(time() - start, 3)}s',
            time() - start,
        )
    return output_df
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
boto3==1.34.0
dateparser==1.2.0
faust-cchardet==2.1.19
pandas==2.2.0
pytest==8.3.0
Expand Down
19 changes: 19 additions & 0 deletions tests/test_fields.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pandas as pd
from numpy import random
import pytest
from datetime import date as _date, datetime as _datetime

from csv_detective.detect_fields.FR.geo import (
adresse,
Expand Down Expand Up @@ -46,6 +48,7 @@
detetect_categorical_variable,
)
from csv_detective.explore_csv import return_all_tests
from csv_detective.utils import cast


def test_all_tests_return_bool():
Expand Down Expand Up @@ -504,3 +507,19 @@ def test_match_float():
def test_not_match_float():
for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
assert not test_float._is(val)


# Each case: (raw string value, detected type label, expected Python type after casting).
@pytest.mark.parametrize(
    "args",
    (
        ("1.9", "float", float),
        ("oui", "bool", bool),
        ("[1, 2]", "json", list),
        ('{"a": 1}', "json", dict),
        ("2022-08-01", "date", _date),
        ("2024-09-23 17:32:07", "datetime", _datetime),
    ),
)
def test_cast(args):
    # only checks the type of the result, not its exact value
    value, detected_type, cast_type = args
    assert isinstance(cast(value, detected_type), cast_type)
28 changes: 28 additions & 0 deletions tests/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,31 @@ def test_output_df():
assert isinstance(output, dict)
assert isinstance(df, pd.DataFrame)
assert len(df) == 6
assert df["partly_empty"].dtype == pd.Int64Dtype()


# Each case: (cast_json flag passed to routine, expected type of a json cell in the output df).
@pytest.mark.parametrize(
    "cast_json",
    (
        (True, dict),
        (False, str),
    ),
)
def test_cast_json(mocked_responses, cast_json):
    # the parametrized value is a (flag, expected type) pair; unpack it,
    # reusing the name `cast_json` for the flag itself
    cast_json, expected_type = cast_json
    expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
    mocked_responses.get(
        'http://example.com/test.csv',
        body=expected_content,
        status=200,
    )
    analysis, df = routine(
        csv_file_path='http://example.com/test.csv',
        num_rows=-1,
        output_profile=False,
        save_results=False,
        output_df=True,
        cast_json=cast_json,
    )
    assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
    # with cast_json=False the json column must stay as raw strings
    assert isinstance(df["a_simple_dict"][0], expected_type)

0 comments on commit 0bc8d8b

Please sign in to comment.