From a15e929fd61969a29353f54c24ba67d39304866a Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Wed, 15 Jan 2025 22:31:27 -0300 Subject: [PATCH 1/9] format and define version --- docs/docs/index.md | 4 ++-- docs/scripts/generate_references.py | 16 ++++++------- pyproject.toml | 2 +- src/__init__.py | 0 src/lazy_pandas/__init__.py | 2 ++ src/lazy_pandas/general.py | 24 +++++++++---------- tests/dataframe_test.py | 36 ++++++++++++++--------------- tests/reader_test.py | 10 ++++---- 8 files changed, 48 insertions(+), 46 deletions(-) delete mode 100644 src/__init__.py diff --git a/docs/docs/index.md b/docs/docs/index.md index c4a3910..1cbe06f 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -26,10 +26,10 @@ Below is a side-by-side comparison showing how the same operation would look in ```python linenums="1" hl_lines="2 5 13" import pandas as pd - import lazy_pandas as lpd + import lazy_pandas as lp def read_taxi_dataset(location: str) -> pd.DataFrame: - df = lpd.read_csv(location, parse_dates=["pickup_datetime"]) + df = lp.read_csv(location, parse_dates=["pickup_datetime"]) df = df[["pickup_datetime", "passenger_count"]] df["passenger_count"] = df["passenger_count"] df["pickup_date"] = df["pickup_datetime"].dt.date diff --git a/docs/scripts/generate_references.py b/docs/scripts/generate_references.py index 40bc859..a92acbc 100644 --- a/docs/scripts/generate_references.py +++ b/docs/scripts/generate_references.py @@ -6,31 +6,31 @@ package_dir = Path(__file__).parent.parent.parent / "src" sys.path.insert(0, str(package_dir)) -import lazy_pandas as lpd # noqa: E402 +import lazy_pandas as lp # noqa: E402 vls = [] vls += [ (1000 + idx, "lazy_pandas.LazyFrame", f"LazyFrame.{attr}", attr) - for idx, attr in enumerate(sorted(dir(lpd.LazyFrame))) + for idx, attr in enumerate(sorted(dir(lp.LazyFrame))) if not attr.startswith("_") ] vls += [ (1000 + idx, "lazy_pandas.LazyColumn", f"LazyColumn.{attr}", attr) - for idx, attr in enumerate(sorted(dir(lpd.LazyColumn))) 
+ for idx, attr in enumerate(sorted(dir(lp.LazyColumn))) if not attr.startswith("_") and attr not in ["str", "dt", "create_from_function"] ] vls += [ (2000 + idx, "lazy_pandas.LazyStringColumn", f"LazyColumn.str.{attr}", attr) - for idx, attr in enumerate(sorted(dir(lpd.LazyStringColumn))) + for idx, attr in enumerate(sorted(dir(lp.LazyStringColumn))) if not attr.startswith("_") ] vls += [ (3000 + idx, "lazy_pandas.LazyDateTimeColumn", f"LazyColumn.dt.{attr}", attr) - for idx, attr in enumerate(sorted(dir(lpd.LazyDateTimeColumn))) + for idx, attr in enumerate(sorted(dir(lp.LazyDateTimeColumn))) if not attr.startswith("_") ] @@ -50,15 +50,15 @@ fn_names = [ attr - for idx, attr in enumerate(sorted(dir(lpd))) + for idx, attr in enumerate(sorted(dir(lp))) if not attr.startswith("_") - and callable(getattr(lpd, attr)) + and callable(getattr(lp, attr)) and attr not in ["LazyFrame", "LazyColumn", "LazyStringColumn", "LazyDateTimeColumn"] ] template = """ -# lpd.{function_name} +# lp.{function_name} ::: lazy_pandas.{function_name} options: members: diff --git a/pyproject.toml b/pyproject.toml index 629e2ba..236d206 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ extra-args = [ run = "pytest{env:HATCH_TEST_ARGS:} {args}" [tool.hatch.version] -source = "vcs" +path = "src/lazy_pandas/__init__.py" [tool.pytest.ini_options] pythonpath = "src" diff --git a/src/__init__.py b/src/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/lazy_pandas/__init__.py b/src/lazy_pandas/__init__.py index d96b277..133489e 100644 --- a/src/lazy_pandas/__init__.py +++ b/src/lazy_pandas/__init__.py @@ -15,3 +15,5 @@ "LazyDateTimeColumn", "LazyStringColumn", ] + +__version__ = "0.1.0" diff --git a/src/lazy_pandas/general.py b/src/lazy_pandas/general.py index 3d0e4aa..d4badc6 100644 --- a/src/lazy_pandas/general.py +++ b/src/lazy_pandas/general.py @@ -18,9 +18,9 @@ def from_pandas(df) -> LazyFrame: Example: ```python import pandas as pd - import lazy_pandas 
as lpd + import lazy_pandas as lp df = pd.DataFrame({'column1': [1, 2, 3], 'column2': ['a', 'b', 'c']}) - lazy_df = lpd.from_pandas(df) + lazy_df = lp.from_pandas(df) ``` """ return LazyFrame(duckdb.from_df(df)) @@ -115,8 +115,8 @@ def read_csv( Example: ```python - import lazy_pandas as lpd - df = lpd.read_csv('data.csv', header=True, sep=',', dtype={'column1': 'INTEGER', 'column2': 'VARCHAR'}) + import lazy_pandas as lp + df = lp.read_csv('data.csv', header=True, sep=',', dtype={'column1': 'INTEGER', 'column2': 'VARCHAR'}) df.head() ``` """ @@ -218,8 +218,8 @@ def read_json( Example: ```python - import lazy_pandas as lpd - df = lpd.read_json('data.json', columns={'userId': 'INTEGER', 'completed': 'BOOLEAN'}, format='array') + import lazy_pandas as lp + df = lp.read_json('data.json', columns={'userId': 'INTEGER', 'completed': 'BOOLEAN'}, format='array') df.head() ``` """ @@ -275,8 +275,8 @@ def read_parquet( Example: ```python - import lazy_pandas as lpd - df = lpd.read_parquet('data.parquet', columns=['column1', 'column2']) + import lazy_pandas as lp + df = lp.read_parquet('data.parquet', columns=['column1', 'column2']) df.head() ``` """ @@ -306,9 +306,9 @@ def read_delta(path: str, *, conn: duckdb.DuckDBPyConnection | None = None) -> L Example: ```python - import lazy_pandas as lpd + import lazy_pandas as lp from datetime import date - df = lpd.read_delta('s3://bucket/path_to_delta_table') + df = lp.read_delta('s3://bucket/path_to_delta_table') df.head() ``` """ @@ -331,8 +331,8 @@ def read_iceberg(path: str, *, conn: duckdb.DuckDBPyConnection | None = None) -> Example: ```python - import lazy_pandas as lpd - df = lpd.read_iceberg('s3://bucket/path_to_iceberg_table') + import lazy_pandas as lp + df = lp.read_iceberg('s3://bucket/path_to_iceberg_table') df.head() ``` """ diff --git a/tests/dataframe_test.py b/tests/dataframe_test.py index 560875e..38ee5bf 100644 --- a/tests/dataframe_test.py +++ b/tests/dataframe_test.py @@ -1,12 +1,12 @@ import duckdb -import 
lazy_pandas as lpd +import lazy_pandas as lp import numpy as np import pandas as pd def test_list_columns(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) assert df.columns == ["a", "b"] for col_name in df.columns: assert isinstance(col_name, str) @@ -14,7 +14,7 @@ def test_list_columns(): def test_collect(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) df = df.collect() assert isinstance(df, pd.DataFrame) assert df.shape == (1, 2) @@ -23,7 +23,7 @@ def test_collect(): def test_new_column(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) df["c"] = 3 df = df.collect() assert df.shape == (1, 3) @@ -33,7 +33,7 @@ def test_new_column(): def test_overwrite_column(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) df["a"] = 3 df = df.collect() assert df.shape == (1, 2) @@ -43,7 +43,7 @@ def test_overwrite_column(): def test_select_columns(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) df = df[["b"]] df = df.collect() assert df.shape == (1, 1) @@ -52,7 +52,7 @@ def test_select_columns(): def test_head(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b UNION ALL SELECT 3, 4") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) df = df.head(1) df = df.collect() assert df.shape == (1, 2) @@ -61,7 +61,7 @@ def test_head(): def test_sort_values(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b UNION ALL SELECT 3, 4") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) df = df.sort_values("b") df = df.collect() assert df.shape == (2, 2) @@ -71,7 +71,7 @@ def test_sort_values(): def test_drop_duplicates(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b UNION ALL SELECT 1, 2") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) df = df.drop_duplicates() df = df.collect() assert df.shape == (1, 2) @@ -80,7 +80,7 @@ def test_drop_duplicates(): def 
test_drop_duplicates_subset(): rel = duckdb.sql("SELECT 1 AS a, 2 AS b UNION ALL SELECT 2, 2") - df = lpd.LazyFrame(rel) + df = lp.LazyFrame(rel) df = df.drop_duplicates(subset=["b"]) df = df.collect() assert df.shape == (1, 2) @@ -90,8 +90,8 @@ def test_drop_duplicates_subset(): def test_merge_inner(): rel1 = duckdb.sql("SELECT 1 AS a, 2 AS b") rel2 = duckdb.sql("SELECT 1 AS a, 4 AS d") - df1 = lpd.LazyFrame(rel1) - df2 = lpd.LazyFrame(rel2) + df1 = lp.LazyFrame(rel1) + df2 = lp.LazyFrame(rel2) df = df1.merge(df2, on="a") df = df.collect() @@ -102,8 +102,8 @@ def test_merge_inner(): def test_merge_outer(): rel1 = duckdb.sql("SELECT 1 AS a, 2 AS b") rel2 = duckdb.sql("SELECT 2 AS a, 4 AS d") - df1 = lpd.LazyFrame(rel1) - df2 = lpd.LazyFrame(rel2) + df1 = lp.LazyFrame(rel1) + df2 = lp.LazyFrame(rel2) df = df1.merge(df2, on="a", how="outer") df.sort_values("a", inplace=True) @@ -126,8 +126,8 @@ def test_merge_outer(): def test_merge_left(): rel1 = duckdb.sql("SELECT 1 AS a, 2 AS b UNION ALL SELECT 3, 3") rel2 = duckdb.sql("SELECT 1 AS a, 4 AS d UNION ALL SELECT 2, 5") - df1 = lpd.LazyFrame(rel1) - df2 = lpd.LazyFrame(rel2) + df1 = lp.LazyFrame(rel1) + df2 = lp.LazyFrame(rel2) df = df1.merge(df2, on="a", how="left") df.sort_values("a", inplace=True) @@ -151,8 +151,8 @@ def test_merge_left(): def test_merge_right(): rel1 = duckdb.sql("SELECT 1 AS a, 2 AS b UNION ALL SELECT 3, 3") rel2 = duckdb.sql("SELECT 1 AS a, 4 AS d UNION ALL SELECT 2, 5") - df1 = lpd.LazyFrame(rel2) - df2 = lpd.LazyFrame(rel1) + df1 = lp.LazyFrame(rel2) + df2 = lp.LazyFrame(rel1) df = df1.merge(df2, on="a", how="right") df.sort_values("a", inplace=True) diff --git a/tests/reader_test.py b/tests/reader_test.py index 39592ba..915725e 100644 --- a/tests/reader_test.py +++ b/tests/reader_test.py @@ -1,7 +1,7 @@ import os from tempfile import TemporaryDirectory -import lazy_pandas as lpd +import lazy_pandas as lp import pyarrow as pa import pytest from pyiceberg.catalog.sql import SqlCatalog @@ -35,24 
+35,24 @@ def iceberg_table_uri(): def test_read_csv(): wheather_statition_uri = os.path.join(ASSETS_PATH, "weather_station.csv") - df = lpd.read_csv(wheather_statition_uri, sep=";") + df = lp.read_csv(wheather_statition_uri, sep=";") assert df.columns == ["city", "temperature"] def test_read_parquet(): wheather_statition_uri = os.path.join(ASSETS_PATH, "weather_station.parquet") - df = lpd.read_parquet(wheather_statition_uri, columns=["temperature", "city"]) + df = lp.read_parquet(wheather_statition_uri, columns=["temperature", "city"]) assert df.columns == ["temperature", "city"] def test_read_delta(): delta_table_uri = os.path.join(ASSETS_PATH, "delta_table") - df = lpd.read_delta(delta_table_uri) + df = lp.read_delta(delta_table_uri) assert df.columns == ["a", "b", "c"] def test_read_iceberg(iceberg_table_uri): - df = lpd.read_iceberg(iceberg_table_uri) + df = lp.read_iceberg(iceberg_table_uri) assert df.columns == ["lat", "long"] df = df.collect() assert df.shape == (3, 2) From 1a8a3da5d160a455f5b4829d8b04ef3415bc7c3f Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Thu, 16 Jan 2025 21:27:04 -0300 Subject: [PATCH 2/9] Prepare first release --- docs/docs/assets/profiler/lazy_pandas.json | 2 +- docs/docs/assets/profiler/pandas.json | 2 +- docs/docs/index.md | 23 +++++++------------ docs/mkdocs.yml | 1 - pyproject.toml | 2 +- .../column/lazy_datetime_column.py | 20 ++++++++-------- 6 files changed, 21 insertions(+), 29 deletions(-) diff --git a/docs/docs/assets/profiler/lazy_pandas.json b/docs/docs/assets/profiler/lazy_pandas.json index 5216e64..d8a1fe8 100644 --- a/docs/docs/assets/profiler/lazy_pandas.json +++ b/docs/docs/assets/profiler/lazy_pandas.json @@ -14,7 +14,7 @@ }, "annotations": [ { - "text": "Uso de memória ao longo do tempo (segundos)", + "text": "Memory usage over time (seconds)", "xref": "paper", "yref": "paper", "x": 0.5, diff --git a/docs/docs/assets/profiler/pandas.json b/docs/docs/assets/profiler/pandas.json index 1342613..3a37e75 100644 
--- a/docs/docs/assets/profiler/pandas.json +++ b/docs/docs/assets/profiler/pandas.json @@ -15,7 +15,7 @@ }, "annotations": [ { - "text": "Uso de memória ao longo do tempo (segundos)", + "text": "Memory usage over time (seconds)", "xref": "paper", "yref": "paper", "x": 0.5, diff --git a/docs/docs/index.md b/docs/docs/index.md index 1cbe06f..5058d58 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -1,27 +1,19 @@ --- title: Lazy Pandas hide: - - navigation - - toc + - navigation + - toc --- - -# Lazy Pandas - -Welcome to the **Lazy Pandas** official documentation! -A library inspired by [pandas](https://pandas.pydata.org/) that focuses on *lazy* processing, enabling high performance and lower memory usage for large datasets. +Welcome to the Lazy Pandas official documentation! A library that allows you to use the pandas API with DuckDB. ## What is Lazy Pandas? -Lazy Pandas is built on the concept of delaying DataFrame operations until they are strictly necessary (lazy evaluation). This allows: -- Operations to be optimized in batches. -- Memory usage to be minimized during processing. -- Total runtime to be reduced for complex pipelines. +LazyPandas is a wrapper arround DuckDB that allows you to use the pandas API to interact with DuckDB. This library is not a pandas replacement, but a way to use the pandas API with DuckDB. Pandas is awesome and adopted by many people, but it is not the best tool for datasets that do not fit in memory. So why not give the power of duckdb to pandas users? 
## Code Comparison Below is a side-by-side comparison showing how the same operation would look in **Pandas** versus **Lazy Pandas**: - === "Lazy Pandas" ```python linenums="1" hl_lines="2 5 13" @@ -41,7 +33,6 @@ Below is a side-by-side comparison showing how the same operation would look in return df ``` - === "Pandas" ```python linenums="1" @@ -65,8 +56,7 @@ Notice that in traditional **pandas**, operations are executed immediately, whil ## Memory Usage -Below is a fictitious performance comparison between **pandas** and **Lazy Pandas**, showing a scenario where a large dataset is processed in three stages (reading, aggregation, and complex filtering). - +Running the previous code on a 5.7GB CSV file with 55 million rows, we can see the memory usage difference between **Pandas** and **Lazy Pandas**:
```plotly @@ -78,4 +68,7 @@ Below is a fictitious performance comparison between **pandas** and **Lazy Panda ```
+In the **Pandas** example, memory usage spikes to 25.8 GB and the run takes 8 minutes to complete, while in the **Lazy Pandas** example, memory usage remains constant at 500 MB and the run takes 6 seconds to complete. +For the test, we used a MacBook Pro M1 with 16 GB of RAM. The dataset used was the [NYC Taxi Dataset](https://www.kaggle.com/code/debjeetdas/nyc-taxi-fare-eda-prediction-using-linear-reg/input?select=train.csv) available on Kaggle. + diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 197c12a..5b1bbf8 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -27,7 +27,6 @@ theme: - navigation.sections - toc.integrate - toc.follow - - content.action.edit plugins: #- include-markdown - plotly diff --git a/pyproject.toml b/pyproject.toml index 236d206..203437b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ requires = [ dependencies = [ "duckdb", ] -description = "Add your description here" +description = "The power of duckdb with the ease of pandas" dynamic = [ "version", ] diff --git a/src/lazy_pandas/column/lazy_datetime_column.py b/src/lazy_pandas/column/lazy_datetime_column.py index 0335aed..d9f320e 100644 --- a/src/lazy_pandas/column/lazy_datetime_column.py +++ b/src/lazy_pandas/column/lazy_datetime_column.py @@ -51,7 +51,7 @@ def date(self) -> "LazyColumn": # 1 2023-01-15 12:34:56 # 2 2023-03-31 23:59:59 - df["my_datetime_column"].dt.date() + df["my_datetime_column"].dt.date # Expected output (LazyColumn in lazy mode): # [2023-01-01, 2023-01-15, 2023-03-31, ...] ``` @@ -75,7 +75,7 @@ def year(self) -> "LazyColumn": # 1 2024-05-10 15:30:00 # 2 2025-12-31 23:59:59 - df["my_datetime_column"].dt.year() + df["my_datetime_column"].dt.year # [2023, 2024, 2025, ...]
``` """ @@ -100,7 +100,7 @@ def quarter(self) -> "LazyColumn": # 3 2023-07-10 10:10:10 # Q3 # 4 2023-10-25 11:11:11 # Q4 - df["my_datetime_column"].dt.quarter() + df["my_datetime_column"].dt.quarter # [1, 1, 2, 3, 4] ``` """ @@ -123,7 +123,7 @@ def month(self) -> "LazyColumn": # 1 2023-02-15 12:34:56 # 2 2023-12-31 23:59:59 - df["my_datetime_column"].dt.month() + df["my_datetime_column"].dt.month # [1, 2, 12, ...] ``` """ @@ -146,7 +146,7 @@ def day(self) -> "LazyColumn": # 1 2023-01-15 12:34:56 # day=15 # 2 2023-12-31 23:59:59 # day=31 - df["my_datetime_column"].dt.day() + df["my_datetime_column"].dt.day # [1, 15, 31, ...] ``` """ @@ -169,7 +169,7 @@ def is_month_start(self) -> "LazyColumn": # 1 2023-01-15 12:34:56 # 2 2023-02-01 00:00:00 # start of February - df["my_datetime_column"].dt.is_month_start() + df["my_datetime_column"].dt.is_month_start # [True, False, True, ...] ``` """ @@ -197,7 +197,7 @@ def is_quarter_start(self) -> "LazyColumn": # 2 2023-07-01 08:00:00 # start of Q3 # 3 2023-10-01 23:59:59 # start of Q4 - df["my_datetime_column"].dt.is_quarter_start() + df["my_datetime_column"].dt.is_quarter_start # [True, True, True, True, ...] ``` """ @@ -224,7 +224,7 @@ def is_year_start(self) -> "LazyColumn": # 1 2023-02-15 12:34:56 # 2 2024-01-01 23:59:59 # start of the next year - df["my_datetime_column"].dt.is_year_start() + df["my_datetime_column"].dt.is_year_start # [True, False, True, ...] ``` """ @@ -252,7 +252,7 @@ def is_month_end(self) -> "LazyColumn": # 2 2023-02-15 00:00:00 # 3 2023-02-28 23:59:59 # end of February (non leap year) - df["my_datetime_column"].dt.is_month_end() + df["my_datetime_column"].dt.is_month_end # [True, True, False, True, ...] ``` """ @@ -283,7 +283,7 @@ def weekday(self) -> "LazyColumn": # 2 2023-01-07 23:59:59 # Saturday # 3 2023-01-08 08:00:00 # Sunday - df["my_datetime_column"].dt.weekday() + df["my_datetime_column"].dt.weekday # [1, 2, 6, 7, ...] 
``` """ From 0fc8c2043bd99ed102784b5a641feaea14d26192 Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Thu, 16 Jan 2025 21:44:03 -0300 Subject: [PATCH 3/9] Test --- README.md | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 030adb5..4ff7a26 100644 --- a/README.md +++ b/README.md @@ -1 +1,45 @@ -# lazy_pandas +# Lazy Pandas +Lazy Pandas is a Python library that simplifies the use of DuckDB by wrapping it in the pandas API. This library is not a pandas replacement, but a way to use the pandas API with DuckDB. Pandas is awesome and adopted by many people, but it is not the best tool for datasets that do not fit in memory. So why not give the power of DuckDB to pandas users? + +## Installation + +To install Lazy Pandas, you can use pip: + +```sh +pip install lazy-pandas +``` + +## Usage + +Here is a basic example of how to use Lazy Pandas: + +import lazy_pandas as lp + +# Load a DataFrame +df = lp.read_csv('data.csv') + +# Display the first few rows of the DataFrame +lp.show(df) + +# Get descriptive statistics +lp.describe(df) + +Features + +- File Reading: Simplified functions for reading CSV, Excel, etc. +- Quick Visualization: Functions for quickly displaying DataFrames. +- Statistical Analysis: Functions for obtaining descriptive statistics and other analyses. + +Contribution + +Contributions are welcome! Feel free to open issues and pull requests. + +1. Fork the project +2. Create a branch for your feature (git checkout -b my-feature) +3. Commit your changes (git commit -am 'Add my feature') +4. Push to the branch (git push origin my-feature) +5. Open a Pull Request + +License + +This project is licensed under the MIT License - see the LICENSE file for details.
From 0a52657c42d9509c9c8e3a426d92c55fd4342b3a Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Thu, 16 Jan 2025 22:06:47 -0300 Subject: [PATCH 4/9] Add license --- .github/workflows/pipeline.yaml | 68 +++++++++++++++++++++++++++------ LICENSE.txt | 21 ++++++++++ README.md | 28 +++++--------- docs/docs/index.md | 2 - makefile | 3 ++ src/lazy_pandas/__init__.py | 2 +- 6 files changed, 92 insertions(+), 32 deletions(-) create mode 100644 LICENSE.txt diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index 64ba010..6580f51 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -15,25 +15,71 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true + jobs: - Pipeline: + BuildDocs: + name: Build Docs runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Set up Python 3.12 - uses: actions/setup-python@v5 + - name: Install uv + uses: astral-sh/setup-uv@v3 with: - python-version: "3.12" - - - name: Set up UV - run: python -m pip install uv + enable-cache: true - - name: Generate documentation + - name: Execute Tests run: make docs-build - - name: Execute lint checks - run: make lint + UnitTests: + name: Unit Tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 - - name: Execute tests + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + + - name: Execute Tests run: make test-all + + StaticChecks: + name: Static Checks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + + - name: Execute Static Checks + run: make lint + + Publish: + name: Publish + runs-on: ubuntu-latest + needs: [UnitTests, StaticChecks, BuildDocs] + environment: + name: pypi + url: https://pypi.org/p/lazy-pandas + permissions: + id-token: write + if: startsWith(github.ref, 'refs/tags/v') + steps: + - uses: actions/checkout@v4 + + -
name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + + - name: Build package + run: make build + + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..a4ca27c --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Mario Taddeucci + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index 4ff7a26..c69d9ba 100644 --- a/README.md +++ b/README.md @@ -12,34 +12,26 @@ pip install lazy-pandas ## Usage Here is a basic example of how to use Lazy Pandas: - +```python import lazy_pandas as lp -# Load a DataFrame -df = lp.read_csv('data.csv') - -# Display the first few rows of the DataFrame -lp.show(df) - -# Get descriptive statistics -lp.describe(df) +df = lp.read_csv(location, parse_dates=["pickup_datetime"]) +df = df[["pickup_datetime", "passenger_count"]] +df["pickup_date"] = df["pickup_datetime"].dt.date +df = df.sort_values("pickup_date") +df = df.collect() # Materialize the lazy DataFrame to a pandas DataFrame +``` Features -- File Reading: Simplified functions for reading CSV, Excel, etc. -- Quick Visualization: Functions for quickly displaying DataFrames. -- Statistical Analysis: Functions for obtaining descriptive statistics and other analyses. +- Lazy evaluation +- SQL support +- Support for DuckDB extensions (e.g., Delta, Iceberg, etc.) Contribution Contributions are welcome! Feel free to open issues and pull requests. -1. Fork the project -2. Create a branch for your feature (git checkout -b my-feature) -3. Commit your changes (git commit -am 'Add my feature') -4. Push to the branch (git push origin my-feature) -5. Open a Pull Request - License This project is licensed under the MIT License - see the LICENSE file for details. 
diff --git a/docs/docs/index.md b/docs/docs/index.md index 5058d58..8a2882e 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -23,7 +23,6 @@ Below is a side-by-side comparison showing how the same operation would look in def read_taxi_dataset(location: str) -> pd.DataFrame: df = lp.read_csv(location, parse_dates=["pickup_datetime"]) df = df[["pickup_datetime", "passenger_count"]] - df["passenger_count"] = df["passenger_count"] df["pickup_date"] = df["pickup_datetime"].dt.date del df["pickup_datetime"] df = df.groupby("pickup_date").sum().reset_index() @@ -42,7 +41,6 @@ Below is a side-by-side comparison showing how the same operation would look in def read_taxi_dataset(location: str) -> pd.DataFrame: df = pd.read_csv(location, parse_dates=["pickup_datetime"]) df = df[["pickup_datetime", "passenger_count"]] - df["passenger_count"] = df["passenger_count"] df["pickup_date"] = df["pickup_datetime"].dt.date del df["pickup_datetime"] df = df.groupby("pickup_date").sum().reset_index() diff --git a/makefile b/makefile index 85035d5..e5edf71 100644 --- a/makefile +++ b/makefile @@ -1,6 +1,9 @@ UVX = uvx MKDOCS_OPTS = --with-requirements requirements.txt +build: + uv build + test: $(UVX) hatch test diff --git a/src/lazy_pandas/__init__.py b/src/lazy_pandas/__init__.py index 133489e..80ecf8b 100644 --- a/src/lazy_pandas/__init__.py +++ b/src/lazy_pandas/__init__.py @@ -16,4 +16,4 @@ "LazyStringColumn", ] -__version__ = "0.1.0" +__version__ = "0.1.0dev1" From 89e879f16f4d7d5f07755e6ee00615405650e4e1 Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Thu, 16 Jan 2025 22:07:37 -0300 Subject: [PATCH 5/9] one license file --- LICENSE.txt | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 LICENSE.txt diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index a4ca27c..0000000 --- a/LICENSE.txt +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2023 Mario Taddeucci - -Permission is hereby granted, free of charge, to any 
person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. From 7ded9f24e5d288b37a88cb357d129dcf100e22bc Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Thu, 16 Jan 2025 22:11:08 -0300 Subject: [PATCH 6/9] Add installation data --- docs/docs/index.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/docs/index.md b/docs/docs/index.md index 8a2882e..0b7deda 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -4,7 +4,13 @@ hide: - navigation - toc --- -Welcome to the Lazy Pandas official documentation! A library that allows you to use the pandas API with DuckDB. +Welcome to the Lazy Pandas official documentation! A library that allows you to use the pandas API with DuckDB as simple as a pip install. + +To start using Lazy Pandas, you can install it using pip: + +```sh +pip install lazy-pandas +``` ## What is Lazy Pandas? 
From 109286c9cad4e55d464e705847b8a79e993d41bc Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Fri, 17 Jan 2025 06:13:33 -0300 Subject: [PATCH 7/9] Fix pipeline --- .github/workflows/pipeline.yaml | 36 ++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index 6580f51..1791083 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -23,10 +23,13 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v3 + - name: Set up Python + uses: actions/setup-python@v4 with: - enable-cache: true + python-version: '3.11' + + - name: Setup UV + run: make build - name: Execute Tests run: make docs-build @@ -37,10 +40,13 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v3 + - name: Set up Python + uses: actions/setup-python@v4 with: - enable-cache: true + python-version: '3.11' + + - name: Setup UV + run: make build - name: Execute Tests run: make test-all @@ -51,10 +57,13 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v3 + - name: Set up Python + uses: actions/setup-python@v4 with: - enable-cache: true + python-version: '3.11' + + - name: Setup UV + run: make build - name: Execute Static Checks run: make lint @@ -72,10 +81,13 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v3 + - name: Set up Python + uses: actions/setup-python@v4 with: - enable-cache: true + python-version: '3.11' + + - name: Setup UV + run: make build - name: Build package run: make build From fda3b8797c5cbd20e46809c1c60e67742333770b Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Fri, 17 Jan 2025 06:19:27 -0300 Subject: [PATCH 8/9] Fix pipeline --- .github/workflows/pipeline.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pipeline.yaml 
b/.github/workflows/pipeline.yaml index 1791083..6641d1c 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -29,7 +29,7 @@ jobs: python-version: '3.11' - name: Setup UV - run: make build + run: pip install uv - name: Execute Tests run: make docs-build @@ -46,7 +46,7 @@ jobs: python-version: '3.11' - name: Setup UV - run: make build + run: pip install uv - name: Execute Tests run: make test-all @@ -63,7 +63,7 @@ jobs: python-version: '3.11' - name: Setup UV - run: make build + run: pip install uv - name: Execute Static Checks run: make lint @@ -87,7 +87,7 @@ jobs: python-version: '3.11' - name: Setup UV - run: make build + run: pip install uv - name: Build package run: make build From 7907899a18e8e6a64f86afed4e01cd933935a287 Mon Sep 17 00:00:00 2001 From: Mario Taddeucci Date: Fri, 17 Jan 2025 06:21:07 -0300 Subject: [PATCH 9/9] fix typo --- docs/docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/index.md b/docs/docs/index.md index 0b7deda..69a134f 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -14,7 +14,7 @@ pip install lazy-pandas ## What is Lazy Pandas? -LazyPandas is a wrapper arround DuckDB that allows you to use the pandas API to interact with DuckDB. This library is not a pandas replacement, but a way to use the pandas API with DuckDB. Pandas is awesome and adopted by many people, but it is not the best tool for datasets that do not fit in memory. So why not give the power of duckdb to pandas users? +LazyPandas is a wrapper around DuckDB that allows you to use the pandas API to interact with DuckDB. This library is not a pandas replacement, but a way to use the pandas API with DuckDB. Pandas is awesome and adopted by many people, but it is not the best tool for datasets that do not fit in memory. So why not give the power of duckdb to pandas users? ## Code Comparison