docs: linked mkdocs & api docs (#3703)
This PR bumps docs-v2 to be the main docs directory. All User Guide content is now hosted on MkDocs, while the API docs remain on Sphinx, with relative paths linking the two. The docs build successfully on Read the Docs.

`make docs` builds both sets of docs locally.
ccmao1130 authored Jan 28, 2025
1 parent 45f27c5 commit a8d63dd
Showing 123 changed files with 349 additions and 9,287 deletions.
3 changes: 0 additions & 3 deletions .gitignore
@@ -31,9 +31,6 @@ log/
# pytest benchmarks
.benchmarks

# docs autogen
/docs/source/api_docs/doc_gen/

# Added by pyenv
.python-version

10 changes: 7 additions & 3 deletions .readthedocs.yaml
@@ -1,16 +1,20 @@
version: 2

build:
os: ubuntu-22.04
os: "ubuntu-24.04"
tools:
python: '3.10'
rust: '1.64'
jobs:
pre_build:
- VIRTUAL_ENV=${READTHEDOCS_VIRTUALENV_PATH} make VENV=${READTHEDOCS_VIRTUALENV_PATH} build

sphinx:
configuration: docs/source/conf.py
python:
install:
- requirements: requirements-docs.txt

mkdocs:
configuration: docs/mkdocs.yml

formats:
- htmlzip
8 changes: 6 additions & 2 deletions Makefile
@@ -71,8 +71,12 @@ dsdgen: .venv ## Generate TPC-DS data
$(VENV_BIN)/python benchmarking/tpcds/datagen.py --scale-factor=$(SCALE_FACTOR) --tpcds-gen-folder=$(OUTPUT_DIR)

.PHONY: docs
docs: .venv ## Serve docs
uv run --with-requirements requirements-docs.txt mkdocs serve
docs: .venv sphinx-docs ## Build both MkDocs and Sphinx documentation
JUPYTER_PLATFORM_DIRS=1 uv run --with-requirements requirements-docs.txt mkdocs build -f docs/mkdocs.yml

.PHONY: sphinx-docs
sphinx-docs: .venv ## Build Sphinx API documentation
uv run --with-requirements requirements-docs.txt sphinx-build -b html "docs/sphinx/source" "docs/sphinx/_build"

.PHONY: clean
clean:
2 changes: 1 addition & 1 deletion daft/expressions/expressions.py
@@ -134,7 +134,7 @@ def lit(value: object) -> Expression:
def col(name: str) -> Expression:
"""Creates an Expression referring to the column with the provided name.
See :ref:`Column Wildcards` for details on wildcards.
See `Column Wildcards <../../../core_concepts/#selecting-columns-using-wildcards>`_ for details on wildcards.
Example:
>>> import daft
Binary file removed docs-v2/img/daft_diagram.png
4 changes: 4 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,4 @@
sphinx/_build
sphinx/source/doc_gen/
site
mkdocs/api_docs
8 changes: 4 additions & 4 deletions docs/Makefile
@@ -5,16 +5,16 @@
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
SPHINX_SOURCEDIR = sphinx/source
SPHINX_BUILDDIR = sphinx/_build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@$(SPHINXBUILD) -M help "$(SPHINX_SOURCEDIR)" "$(SPHINX_BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@$(SPHINXBUILD) -M $@ "$(SPHINX_SOURCEDIR)" "$(SPHINX_BUILDDIR)" $(SPHINXOPTS) $(O)
26 changes: 26 additions & 0 deletions docs/hooks.py
@@ -0,0 +1,26 @@
import os
import shutil
import subprocess


def make_api_docs(*args, **kwargs):
env = os.environ.copy()
env["PATH"] = f"{os.path.abspath('.venv/bin')}:{env['PATH']}"

# Run sphinx-build directly instead of using make
venv_path = os.getenv("READTHEDOCS_VIRTUALENV_PATH", ".venv")
sphinx_build = os.path.join(os.path.abspath(f"{venv_path}/bin"), "sphinx-build")
subprocess.run(
[
sphinx_build,
"-b",
"html",
"docs/sphinx/source", # source dir
"docs/sphinx/_build", # build dir
],
check=True,
env=env,
)

# Copy built docs to mkdocs directory
shutil.copytree("docs/sphinx/_build", "docs/mkdocs/api_docs", dirs_exist_ok=True)
27 changes: 19 additions & 8 deletions mkdocs.yml → docs/mkdocs.yml
@@ -4,12 +4,12 @@
# Project Information
site_name: Daft Documentation

docs_dir: docs-v2
docs_dir: mkdocs

# Scarf pixel for tracking analytics
image:
referrerpolicy: "no-referrer-when-downgrade"
src: "https://static.scarf.sh/a.png?x-pxid=c9065f3a-a090-4243-8f69-145d5de7bfca"
# image:
# referrerpolicy: "no-referrer-when-downgrade"
# src: "https://static.scarf.sh/a.png?x-pxid=c9065f3a-a090-4243-8f69-145d5de7bfca"


# Repository
@@ -46,7 +46,7 @@ nav:
- Telemetry: resources/telemetry.md
- Migration Guide:
- Coming from Dask: migration/dask_migration.md
- API Docs
- API Docs: api_docs/index.html

# Configuration
theme:
@@ -82,15 +82,15 @@ theme:
name: Switch to light mode
- media: "(prefers-color-scheme: light)"
scheme: default
primary: indigo
accent: indigo
primary: custom
accent: custom
toggle:
icon: material/weather-night
name: Switch to dark mode
- media: "(prefers-color-scheme: dark)"
scheme: slate
primary: black
accent: indigo
accent: custom
toggle:
icon: material/theme-light-dark
name: Switch to system preference
@@ -110,6 +110,13 @@ extra:
- icon: fontawesome/brands/x-twitter
link: https://x.com/daft_dataframe

# This is a macro you should use to refer to paths
# When referring to methods, the syntax is {{ api_path }}/path/to/method
api_path: api_docs/doc_gen

extra_css:
- css/extra.css

# Extensions
markdown_extensions:
- admonition
@@ -139,3 +146,7 @@ plugins:
include_source: true
- search:
separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])'
- macros
- mkdocs-simple-hooks:
hooks:
on_pre_build: "docs.hooks:make_api_docs"
File renamed without changes.
@@ -59,7 +59,7 @@ There are some options available to you.

3. Aggressively filter your data so that Daft can avoid reading data that it does not have to (e.g. `df.where(...)`)

4. Request more memory for your UDFs (see [Resource Requests](../core_concepts/udf.md#resource-requests) if your UDFs are memory intensive (e.g. decompression of data, running large matrix computations etc)
4. Request more memory for your UDFs (see [Resource Requests](../core_concepts.md#resource-requests)) if your UDFs are memory intensive (e.g. decompression of data, running large matrix computations, etc.)

5. Increase the number of partitions in your dataframe (hence making each partition smaller) using something like `df.into_partitions(df.num_partitions() * 2)`, as sketched below
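
A minimal sketch combining the filtering and partition-splitting suggestions above (the file path and column name are placeholders, not taken from any real dataset):

```python
import daft
from daft import col

# Placeholder path: substitute your own Parquet files.
df = daft.read_parquet("s3://my-bucket/data/**/*.parquet")

# Option 3: filter aggressively so Daft can avoid reading data it does not need.
df = df.where(col("event_type") == "click")

# Option 5: split each partition in two so that individual partitions are smaller.
df = df.into_partitions(df.num_partitions() * 2)
```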

@@ -33,7 +33,9 @@ General rule of thumb:
Daft will automatically use certain heuristics to determine the number of partitions for you when you create a DataFrame. When reading data from files (e.g. Parquet, CSV or JSON), Daft will group small files/split large files appropriately
into nicely-sized partitions based on their estimated in-memory data sizes.

To interrogate the partitioning of your current DataFrame, you may use the [`df.explain(show_all=True)`](https://www.getdaft.io/projects/docs/en/stable/api_docs/doc_gen/dataframe_methods/daft.DataFrame.explain.html#daft.DataFrame.explain) method. Here is an example output from a simple `df = daft.read_parquet(...)` call on a fairly large number of Parquet files.
To interrogate the partitioning of your current DataFrame, you may use the [`df.explain(show_all=True)`](../{{ api_path }}/dataframe_methods/daft.DataFrame.explain.html) method. Here is an example output from a simple `df = daft.read_parquet(...)` call on a fairly large number of Parquet files.


=== "🐍 Python"

@@ -63,17 +65,17 @@ To interrogate the partitioning of your current DataFrame, you may use the [`df.
| ...
```

In the above example, the call to [`df.read_parquet`](https://www.getdaft.io/projects/docs/en/stable/api_docs/doc_gen/io_functions/daft.read_parquet.html) read 100 Parquet files, but the Physical Plan indicates that Daft will only create 3 partitions. This is because these files are quite small (in this example, totalling about 72MB of data) and Daft recognizes that it should be able to read them as just 3 partitions, each with about 33 files each!
In the above example, the call to [`df.read_parquet`](../{{ api_path }}/io_functions/daft.read_parquet.html) read 100 Parquet files, but the Physical Plan indicates that Daft will only create 3 partitions. This is because these files are quite small (in this example, totalling about 72MB of data) and Daft recognizes that it should be able to read them as just 3 partitions, each containing about 33 files!

## How can I change the way my data is partitioned?

You can change the way your data is partitioned by leveraging certain DataFrame methods:

1. [`daft.DataFrame.repartition`](https://www.getdaft.io/projects/docs/en/stable/api_docs/doc_gen/dataframe_methods/daft.DataFrame.repartition.html#daft.DataFrame.repartition): repartitions your data into `N` partitions by performing a hash-bucketing that ensure that all data with the same values for the specified columns ends up in the same partition. Expensive, requires data movement between partitions and machines.
1. [`daft.DataFrame.repartition`](../{{ api_path }}/dataframe_methods/daft.DataFrame.repartition.html): repartitions your data into `N` partitions by performing a hash-bucketing that ensures that all data with the same values for the specified columns ends up in the same partition. Expensive, requires data movement between partitions and machines.

2. [`daft.DataFrame.into_partitions`](https://www.getdaft.io/projects/docs/en/stable/api_docs/doc_gen/dataframe_methods/daft.DataFrame.into_partitions.html#daft.DataFrame.into_partitions): splits or coalesces adjacent partitions to meet the specified target number of total partitions. This is less expensive than a call to `df.repartition` because it does not require shuffling or moving data between partitions.
2. [`daft.DataFrame.into_partitions`](../{{ api_path }}/dataframe_methods/daft.DataFrame.into_partitions.html): splits or coalesces adjacent partitions to meet the specified target number of total partitions. This is less expensive than a call to `df.repartition` because it does not require shuffling or moving data between partitions.

3. Many global dataframe operations such as [`daft.DataFrame.join`](https://www.getdaft.io/projects/docs/en/stable/api_docs/doc_gen/dataframe_methods/daft.DataFrame.join.html#daft.DataFrame.join), [`daft.DataFrame.sort`](https://www.getdaft.io/projects/docs/en/stable/api_docs/doc_gen/dataframe_methods/daft.DataFrame.sort.html#daft.DataFrame.sort) and [`daft.GroupedDataframe.agg`](https://www.getdaft.io/projects/docs/en/stable/api_docs/groupby.html#daft.dataframe.GroupedDataFrame.agg) will change the partitioning of your data. This is because they require shuffling data between partitions to be globally correct.
3. Many global dataframe operations such as [`daft.DataFrame.join`](../{{ api_path }}/dataframe_methods/daft.DataFrame.join.html), [`daft.DataFrame.sort`](../{{ api_path }}/dataframe_methods/daft.DataFrame.sort.html) and [`daft.GroupedDataFrame.agg`](../api_docs/groupby.html#daft.dataframe.GroupedDataFrame.agg) will change the partitioning of your data. This is because they require shuffling data between partitions to be globally correct.

Note that many of these methods will change both the *number of partitions* as well as the *clustering specification* of the new partitioning. For example, when calling `df.repartition(8, col("x"))`, the resultant dataframe will now have 8 partitions in total with the additional guarantee that all rows with the same value of `col("x")` are in the same partition! This is called "hash partitioning".
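
As a brief, hedged sketch of the first two options (the path and column names are assumptions made for illustration):

```python
import daft
from daft import col

# Placeholder path: substitute your own Parquet files.
df = daft.read_parquet("s3://my-bucket/data/**/*.parquet")

# Hash-partition into 8 partitions keyed on col("x"): all rows sharing a value
# of "x" land in the same partition (the "hash partitioning" described above).
repartitioned = df.repartition(8, col("x"))

# Cheaper alternative when only the partition count matters: splits or coalesces
# adjacent partitions without shuffling data between them.
coalesced = df.into_partitions(16)

# Inspect the resulting physical plan, including the new clustering specification.
repartitioned.explain(show_all=True)
```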
