mariotaddeucci · mariotaddeucci · Jan 17, 2025 · Jan 16, 2025 · Jan 17, 2025 · Jan 17, 2025
diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml
@@ -15,25 +15,83 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+
 jobs:
-  Pipeline:
+  BuildDocs:  # Builds the documentation with `make docs-build`.
+    name: Build Docs
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python 3.12
-        uses: actions/setup-python@v5
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: '3.11'
 
-      - name: Set up UV
-        run: python -m pip install uv
+      - name: Setup UV
+        run: pip install uv
 
-      - name: Generate documentation
+      - name: Build documentation
         run: make docs-build
 
-      - name: Execute lint checks
-        run: make lint
+  UnitTests:  # Runs the test suite with `make test-all`.
+    name: Unit Tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
 
-      - name: Execute tests
+      - name: Setup UV
+        run: pip install uv
+
+      - name: Execute Tests
         run: make test-all
+
+  StaticChecks:  # Runs the lint checks with `make lint`.
+    name: Static Checks
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Setup UV
+        run: pip install uv
+
+      - name: Execute Static Checks
+        run: make lint
+
+  Publish:  # Builds the package and uploads it to PyPI; runs only for refs under refs/tags/v.
+    name: Publish
+    runs-on: ubuntu-latest
+    needs: [UnitTests, StaticChecks, BuildDocs]  # publish only after all check jobs succeed
+    environment:
+      name: pypi
+      url: https://pypi.org/p/lazy-pandas
+    permissions:
+      id-token: write  # presumably for PyPI trusted publishing (no token is passed to the publish step)
+    if: startsWith(github.ref, 'refs/tags/v')
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Setup UV
+        run: pip install uv
+
+      - name: Build package
+        run: make build
+
+      - name: Publish package distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+
diff --git a/README.md b/README.md
@@ -1 +1,37 @@
-# lazy_pandas
+# Lazy Pandas
+Lazy Pandas is a Python library that simplifies the use of DuckDB by wrapping it in the pandas API. This library is not a pandas replacement, but a way to use the pandas API with DuckDB. Pandas is awesome and adopted by many people, but it is not the best tool for datasets that do not fit in memory. So why not give the power of DuckDB to pandas users?
+
+## Installation
+
+To install Lazy Pandas, you can use pip:
+
+```sh
+pip install lazy-pandas
+```
+
+## Usage
+
+Here is a basic example of how to use Lazy Pandas:
+```python
+import lazy_pandas as lp
+
+df = lp.read_csv(location, parse_dates=["pickup_datetime"])
+df = df[["pickup_datetime", "passenger_count"]]
+df["pickup_date"] = df["pickup_datetime"].dt.date
+df = df.sort_values("pickup_date")
+df = df.collect()  # Materialize the lazy DataFrame to a pandas DataFrame
+```
+
+## Features
+
+- Lazy evaluation
+- SQL support
+- Support for DuckDB extensions (e.g., Delta, Iceberg, etc.)
+
+## Contribution
+
+Contributions are welcome! Feel free to open issues and pull requests.
+
+## License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
diff --git a/docs/docs/assets/profiler/lazy_pandas.json b/docs/docs/assets/profiler/lazy_pandas.json
@@ -14,7 +14,7 @@
     },
     "annotations": [
       {
-        "text": "Uso de memória ao longo do tempo (segundos)",
+        "text": "Memory usage over time (seconds)",
         "xref": "paper",
         "yref": "paper",
         "x": 0.5,

diff --git a/docs/docs/assets/profiler/pandas.json b/docs/docs/assets/profiler/pandas.json
@@ -15,7 +15,7 @@
     },
     "annotations": [
       {
-        "text": "Uso de memória ao longo do tempo (segundos)",
+        "text": "Memory usage over time (seconds)",
         "xref": "paper",
         "yref": "paper",
         "x": 0.5,

diff --git a/docs/docs/index.md b/docs/docs/index.md
@@ -1,37 +1,34 @@
 ---
 title: Lazy Pandas
 hide:
-  - navigation
-  - toc
+    - navigation
+    - toc
 ---
+Welcome to the official Lazy Pandas documentation! Lazy Pandas is a library that lets you use the pandas API with DuckDB, and getting started is as simple as a pip install.
 
-# Lazy Pandas
+To start using Lazy Pandas, you can install it using pip:
 
-Welcome to the **Lazy Pandas** official documentation!
-A library inspired by [pandas](https://pandas.pydata.org/) that focuses on *lazy* processing, enabling high performance and lower memory usage for large datasets.
+```sh
+pip install lazy-pandas
+```
 
 ## What is Lazy Pandas?
 
-Lazy Pandas is built on the concept of delaying DataFrame operations until they are strictly necessary (lazy evaluation). This allows:
-- Operations to be optimized in batches.
-- Memory usage to be minimized during processing.
-- Total runtime to be reduced for complex pipelines.
+Lazy Pandas is a wrapper around DuckDB that allows you to use the pandas API to interact with DuckDB. This library is not a pandas replacement, but a way to use the pandas API with DuckDB. Pandas is awesome and adopted by many people, but it is not the best tool for datasets that do not fit in memory. So why not give the power of DuckDB to pandas users?
 
 ## Code Comparison
 
 Below is a side-by-side comparison showing how the same operation would look in **Pandas** versus **Lazy Pandas**:
 
-
 === "Lazy Pandas"
 
     ```python linenums="1" hl_lines="2 5 13"
     import pandas as pd
-    import lazy_pandas as lpd
+    import lazy_pandas as lp
 
     def read_taxi_dataset(location: str) -> pd.DataFrame:
-        df = lpd.read_csv(location, parse_dates=["pickup_datetime"])
+        df = lp.read_csv(location, parse_dates=["pickup_datetime"])
         df = df[["pickup_datetime", "passenger_count"]]
-        df["passenger_count"] = df["passenger_count"]
         df["pickup_date"] = df["pickup_datetime"].dt.date
         del df["pickup_datetime"]
         df = df.groupby("pickup_date").sum().reset_index()
@@ -41,7 +38,6 @@ Below is a side-by-side comparison showing how the same operation would look in
         return df
     ```
 
-
 === "Pandas"
 
     ```python linenums="1"
@@ -51,7 +47,6 @@ Below is a side-by-side comparison showing how the same operation would look in
     def read_taxi_dataset(location: str) -> pd.DataFrame:
         df = pd.read_csv(location, parse_dates=["pickup_datetime"])
         df = df[["pickup_datetime", "passenger_count"]]
-        df["passenger_count"] = df["passenger_count"]
         df["pickup_date"] = df["pickup_datetime"].dt.date
         del df["pickup_datetime"]
         df = df.groupby("pickup_date").sum().reset_index()
@@ -65,8 +60,7 @@ Notice that in traditional **pandas**, operations are executed immediately, whil
 
 ## Memory Usage
 
-Below is a fictitious performance comparison between **pandas** and **Lazy Pandas**, showing a scenario where a large dataset is processed in three stages (reading, aggregation, and complex filtering).
-
+Running the previous code on a 5.7GB CSV file with 55 million rows, we can see the memory usage difference between **Pandas** and **Lazy Pandas**:
 
 <div class="grid cards" markdown>
 ```plotly
@@ -78,4 +72,7 @@ Below is a fictitious performance comparison between **pandas** and **Lazy Panda
 ```
 </div>
 
+In the **Pandas** example, memory usage spikes to 25.8 GB and the run takes 8 minutes to complete, while in the **Lazy Pandas** example, memory usage remains constant at 500 MB and the run takes 6 seconds to complete.
+For the test, we used a MacBook Pro M1 with 16GB. The dataset used was the [NYC Taxi Dataset](https://www.kaggle.com/code/debjeetdas/nyc-taxi-fare-eda-prediction-using-linear-reg/input?select=train.csv) available on Kaggle.
+
 
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -27,7 +27,6 @@ theme:
     - navigation.sections
     - toc.integrate
     - toc.follow
-    - content.action.edit
 plugins:
   #- include-markdown
   - plotly

diff --git a/docs/scripts/generate_references.py b/docs/scripts/generate_references.py
@@ -6,31 +6,31 @@
 package_dir = Path(__file__).parent.parent.parent / "src"
 sys.path.insert(0, str(package_dir))
 
-import lazy_pandas as lpd  # noqa: E402
+import lazy_pandas as lp  # noqa: E402
 
 vls = []
 
 vls += [
     (1000 + idx, "lazy_pandas.LazyFrame", f"LazyFrame.{attr}", attr)
-    for idx, attr in enumerate(sorted(dir(lpd.LazyFrame)))
+    for idx, attr in enumerate(sorted(dir(lp.LazyFrame)))
     if not attr.startswith("_")
 ]
 
 vls += [
     (1000 + idx, "lazy_pandas.LazyColumn", f"LazyColumn.{attr}", attr)
-    for idx, attr in enumerate(sorted(dir(lpd.LazyColumn)))
+    for idx, attr in enumerate(sorted(dir(lp.LazyColumn)))
     if not attr.startswith("_") and attr not in ["str", "dt", "create_from_function"]
 ]
 
 vls += [
     (2000 + idx, "lazy_pandas.LazyStringColumn", f"LazyColumn.str.{attr}", attr)
-    for idx, attr in enumerate(sorted(dir(lpd.LazyStringColumn)))
+    for idx, attr in enumerate(sorted(dir(lp.LazyStringColumn)))
     if not attr.startswith("_")
 ]
 
 vls += [
     (3000 + idx, "lazy_pandas.LazyDateTimeColumn", f"LazyColumn.dt.{attr}", attr)
-    for idx, attr in enumerate(sorted(dir(lpd.LazyDateTimeColumn)))
+    for idx, attr in enumerate(sorted(dir(lp.LazyDateTimeColumn)))
     if not attr.startswith("_")
 ]
 
@@ -50,15 +50,15 @@
 
 fn_names = [
     attr
-    for idx, attr in enumerate(sorted(dir(lpd)))
+    for attr in sorted(dir(lp))  # public module-level callables; no index needed
     if not attr.startswith("_")
-    and callable(getattr(lpd, attr))
+    and callable(getattr(lp, attr))
     and attr not in ["LazyFrame", "LazyColumn", "LazyStringColumn", "LazyDateTimeColumn"]
 ]
 
 
 template = """
-# lpd.{function_name}
+# lp.{function_name}
 ::: lazy_pandas.{function_name}
     options:
         members:

diff --git a/makefile b/makefile
@@ -1,6 +1,9 @@
 UVX = uvx
 MKDOCS_OPTS = --with-requirements requirements.txt
 
+build:  # Runs `uv build`; invoked as `make build` by the Publish workflow job.
+	uv build
+
 test:
 	$(UVX) hatch test
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ requires = [
 dependencies = [
     "duckdb",
 ]
-description = "Add your description here"
+description = "The power of duckdb with the ease of pandas"
 dynamic = [
     "version",
 ]
@@ -40,7 +40,7 @@ extra-args = [
 run = "pytest{env:HATCH_TEST_ARGS:} {args}"
 
 [tool.hatch.version]
-source = "vcs"
+path = "src/lazy_pandas/__init__.py"
 
 [tool.pytest.ini_options]
 pythonpath = "src"

diff --git a/src/__init__.py b/src/__init__.py
diff --git a/src/lazy_pandas/__init__.py b/src/lazy_pandas/__init__.py
@@ -15,3 +15,5 @@
     "LazyDateTimeColumn",
     "LazyStringColumn",
 ]
+
+__version__ = "0.1.0dev1"