Skip to content

Commit

Permalink
Return dataframe with columns properly cast (#104)
Browse files Browse the repository at this point in the history
* feat: add dateparser

* refactor: more robust date detection

* feat: add datetime to fields

* feat: create bool casting function

* feat: add datetime to init

* feat: add more date labels

* refactor: consistency renaming

* feat: add df casting process

* fix: better label test

* feat: add cast test

* refactor: cast column as int

* feat: tests

* fix: use optional instead of pipe

* docs: update changelog

* docs: hint type

Co-authored-by: Adrien Carpentier <[email protected]>

* docs: hint type

Co-authored-by: Adrien Carpentier <[email protected]>

* docs: hint type

Co-authored-by: Adrien Carpentier <[email protected]>

* docs: hint type

Co-authored-by: Adrien Carpentier <[email protected]>

* fix: add missing import

* fix: cast specific datetime formats to datetime

* feat: add option to not cast json columns

---------

Co-authored-by: Adrien Carpentier <[email protected]>
  • Loading branch information
Pierlou and bolinocroustibat authored Dec 16, 2024
1 parent abe0b14 commit 0bc8d8b
Show file tree
Hide file tree
Showing 12 changed files with 187 additions and 61 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
- Enable outputing loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
- Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
- The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)

## 0.7.4 (2024-11-15)

Expand Down
2 changes: 1 addition & 1 deletion csv_detective/detect_fields/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@
)

from .FR.temp import jour_de_la_semaine, mois_de_annee
from .temp import year, date, datetime_iso, datetime_rfc822
from .temp import year, date, datetime, datetime_iso, datetime_rfc822
38 changes: 22 additions & 16 deletions csv_detective/detect_fields/other/booleen/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
PROPORTION = 1
liste_bool = {
'0',
'1',
'vrai',
'faux',
'true',
'false',
'oui',
'non',
'yes',
'no',
'y',
'n',
'o'
bool_mapping = {
"1": True,
"0": False,
"vrai": True,
"faux": False,
"true": True,
"false": False,
"oui": True,
"non": False,
"yes": True,
"no": False,
"y": True,
"n": False,
"o": True,
}

liste_bool = set(bool_mapping.keys())

def _is(val):
'''Détection les booléens'''

def bool_casting(val: str) -> bool:
return bool_mapping.get(val)


def _is(val: str) -> bool:
'''Détecte les booléens'''
return isinstance(val, str) and val.lower() in liste_bool
4 changes: 2 additions & 2 deletions csv_detective/detect_fields/other/float/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
PROPORTION = 1


def float_casting(str2cast):
return float(str2cast.replace(',', '.'))
def float_casting(val: str) -> float:
    """Cast a string to float, accepting a comma as the decimal separator."""
    normalized = val.replace(",", ".")
    return float(normalized)


def _is(val):
Expand Down
58 changes: 21 additions & 37 deletions csv_detective/detect_fields/temp/date/__init__.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,30 @@
import re
from dateutil.parser import parse, ParserError
from csv_detective.detect_fields.other.float import _is as is_float
from unidecode import unidecode
from datetime import datetime
from typing import Optional

from dateparser import parse as date_parser
from dateutil.parser import parse as dateutil_parser, ParserError

PROPORTION = 1
# /!\ this is only for dates, not datetimes which are handled by other utils


def date_casting(val: str) -> Optional[datetime]:
    """Parse a string into a datetime, or return None on failure.

    For performance reasons, we try first with dateutil and fall back on
    the slower but more permissive dateparser.
    """
    try:
        return dateutil_parser(val)
    except (ParserError, ValueError, OverflowError):
        # dateutil can also raise ValueError/OverflowError on exotic input
        # (the pre-refactor code caught these too); fall back to dateparser,
        # which returns None instead of raising when it cannot parse
        return date_parser(val)


def _is(val):
    '''Returns True if val could be a date, False otherwise'''
    # early stops, to cut processing time:
    # plausible date strings are bounded in length (e.g. "DD septembre YYYY")
    if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
        return False
    # a date string is expected to contain a significant share of digits
    threshold = 0.3
    if sum(char.isdigit() for char in val) / len(val) < threshold:
        return False
    res = date_casting(val)
    # reject values with a time component: those are datetimes, not dates
    if not res or res.hour or res.minute or res.second:
        return False
    return True
19 changes: 19 additions & 0 deletions csv_detective/detect_fields/temp/datetime/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Any, Optional

from csv_detective.detect_fields.temp.date import date_casting

PROPORTION = 1


def _is(val: Optional[Any]) -> bool:
    '''Returns True if val could be a datetime, False otherwise'''
    # early stops, to cut processing time:
    # plausible datetime strings fall in a bounded length window
    if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
        return False
    # a datetime string is expected to be mostly digits
    threshold = 0.7
    if sum(char.isdigit() for char in val) / len(val) < threshold:
        return False
    res = date_casting(val)
    # only values with a time component qualify; plain dates are handled
    # by the date detection module
    if res and (res.hour or res.minute or res.second):
        return True
    return False
4 changes: 3 additions & 1 deletion csv_detective/detect_labels/temp/date/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def _is(header):
'dateouv',
'date der maj',
'dmaj',
'jour'
'jour',
'yyyymmdd',
'aaaammjj',
]
processed_header = _process_text(header)

Expand Down
19 changes: 17 additions & 2 deletions csv_detective/explore_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@
from csv_detective import detect_fields, detect_labels
from csv_detective.s3_utils import download_from_minio, upload_to_minio
from csv_detective.schema_generation import generate_table_schema
from csv_detective.utils import test_col, test_label, prepare_output_dict, display_logs_depending_process_time
from csv_detective.utils import (
cast_df,
display_logs_depending_process_time,
prepare_output_dict,
test_col,
test_label,
)
from .detection import (
detect_engine,
detect_separator,
Expand Down Expand Up @@ -111,6 +117,7 @@ def routine(
output_profile: bool = False,
output_schema: bool = False,
output_df: bool = False,
cast_json: bool = True,
verbose: bool = False,
sheet_name: Union[str, int] = None,
) -> Union[dict, tuple[dict, pd.DataFrame]]:
Expand All @@ -127,6 +134,7 @@ def routine(
output_profile: whether or not to add the 'profile' field to the output
output_schema: whether or not to add the 'schema' field to the output (tableschema)
output_df: whether or not to return the loaded DataFrame along with the analysis report
cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
verbose: whether or not to print process logs in console
sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
skipna: whether to keep NaN (empty cells) for tests
Expand Down Expand Up @@ -276,6 +284,8 @@ def routine(
"json": "json",
"json_geojson": "json",
"datetime": "datetime",
"datetime_iso": "datetime",
"datetime_rfc822": "datetime",
"date": "date",
"latitude": "float",
"latitude_l93": "float",
Expand Down Expand Up @@ -352,7 +362,12 @@ def routine(
time() - start_routine
)
if output_df:
return analysis, table
return analysis, cast_df(
df=table,
columns=analysis["columns"],
cast_json=cast_json,
verbose=verbose,
)
return analysis


Expand Down
55 changes: 53 additions & 2 deletions csv_detective/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from typing import Callable
from typing import Callable, Optional, Union
import json
import pandas as pd
import logging
from time import time
from datetime import date, datetime

from csv_detective.detect_fields.other.booleen import bool_casting
from csv_detective.detect_fields.other.float import float_casting
from csv_detective.detect_fields.temp.date import date_casting

logging.basicConfig(level=logging.INFO)

Expand Down Expand Up @@ -210,7 +216,52 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):

def full_word_strictly_inside_string(word: str, string: str):
    """Return True if `word` equals `string` or appears in it as a whole,
    space-delimited word (never as a substring of a longer word)."""
    return (
        word == string
        or (" " + word + " " in string)
        or (string.startswith(word + " "))
        or (string.endswith(" " + word))
    )


def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
    """Cast a raw string cell to the Python type detected for its column.

    Empty cells and non-string values are treated as missing.
    Raises ValueError for an unrecognized `_type` label.
    """
    if not isinstance(value, str) or not value:
        # None is the current default value in hydra, should we keep this?
        return None
    if _type == "float":
        return float_casting(value)
    if _type == "bool":
        return bool_casting(value)
    if _type == "json":
        # in hydra json are given to postgres as strings, conversion is done by postgres
        return json.loads(value)
    if _type in ("date", "datetime"):
        parsed = date_casting(value)
        if not parsed:
            return None
        # dates are truncated to their date part; datetimes kept as-is
        return parsed.date() if _type == "date" else parsed
    raise ValueError(f"Unknown type `{_type}`")


def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
    """Return a new DataFrame with each column cast to its detected python type.

    Args:
        df: the loaded DataFrame (cells are raw strings). NOTE: it is consumed
            column by column (columns are deleted as they are cast) to save RAM.
        columns: analysis output mapping column name to a dict that contains
            at least the "python_type" key.
        cast_json: if False, json columns are kept as raw strings instead of
            being loaded into Python objects.
        verbose: whether to log the processing time.
    """
    if verbose:
        start = time()
    output_df = pd.DataFrame()
    for col_name, detection in columns.items():
        detected_type = detection["python_type"]
        if detected_type == "string" or (detected_type == "json" and not cast_json):
            # no change if detected type is string
            output_df[col_name] = df[col_name].copy()
        elif detected_type == "int":
            # nullable integer dtype, to allow having ints and NaN in the same column
            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
        else:
            # cast receives one cell value at a time
            output_df[col_name] = df[col_name].apply(
                lambda value: cast(value, _type=detected_type)
            )
        # to save RAM
        del df[col_name]
    if verbose:
        display_logs_depending_process_time(
            f'Casting columns completed in {round(time() - start, 3)}s',
            time() - start,
        )
    return output_df
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
boto3==1.34.0
dateparser==1.2.0
faust-cchardet==2.1.19
pandas==2.2.0
pytest==8.3.0
Expand Down
19 changes: 19 additions & 0 deletions tests/test_fields.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pandas as pd
from numpy import random
import pytest
from datetime import date as _date, datetime as _datetime

from csv_detective.detect_fields.FR.geo import (
adresse,
Expand Down Expand Up @@ -46,6 +48,7 @@
detetect_categorical_variable,
)
from csv_detective.explore_csv import return_all_tests
from csv_detective.utils import cast


def test_all_tests_return_bool():
Expand Down Expand Up @@ -504,3 +507,19 @@ def test_match_float():
def test_not_match_float():
for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
assert not test_float._is(val)


# Each case: (raw string value, detected type label, expected Python type after casting).
@pytest.mark.parametrize(
    "args",
    (
        ("1.9", "float", float),
        ("oui", "bool", bool),
        ("[1, 2]", "json", list),
        ('{"a": 1}', "json", dict),
        ("2022-08-01", "date", _date),
        ("2024-09-23 17:32:07", "datetime", _datetime),
    ),
)
def test_cast(args):
    # only checks the type of the result, not its exact value
    value, detected_type, cast_type = args
    assert isinstance(cast(value, detected_type), cast_type)
28 changes: 28 additions & 0 deletions tests/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,31 @@ def test_output_df():
assert isinstance(output, dict)
assert isinstance(df, pd.DataFrame)
assert len(df) == 6
assert df["partly_empty"].dtype == pd.Int64Dtype()


# Each case: (cast_json flag passed to routine, expected type of a json cell in the output df).
@pytest.mark.parametrize(
    "cast_json",
    (
        (True, dict),
        (False, str),
    ),
)
def test_cast_json(mocked_responses, cast_json):
    # the parametrized value is a (flag, expected type) pair; unpack it,
    # reusing the name `cast_json` for the flag itself
    cast_json, expected_type = cast_json
    expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
    mocked_responses.get(
        'http://example.com/test.csv',
        body=expected_content,
        status=200,
    )
    analysis, df = routine(
        csv_file_path='http://example.com/test.csv',
        num_rows=-1,
        output_profile=False,
        save_results=False,
        output_df=True,
        cast_json=cast_json,
    )
    assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
    # with cast_json=False the json column must stay as raw strings
    assert isinstance(df["a_simple_dict"][0], expected_type)

0 comments on commit 0bc8d8b

Please sign in to comment.