diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..866f9aa4
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,66 @@
+FROM python:3.11-slim AS base
+ENV DEBIAN_FRONTEND=noninteractive
+ENV ACCEPT_EULA=y
+WORKDIR /opt/app-root/src/mostlyai/
+
+FROM base AS deps
+
+RUN apt-get update -y \
+    && apt-get install -y libaio1 curl gnupg unzip \
+    # * PostgreSQL Connector Dependencies
+    && apt-get install -y libpq-dev gcc g++ \
+    # * Kerberos Dependencies for Hive Connector
+    && apt-get install -y libkrb5-dev krb5-user \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN VERSION_ID=$(grep VERSION_ID /etc/os-release | cut -d '"' -f 2) \
+    && curl -sSL -o /tmp/packages-microsoft-prod.deb https://packages.microsoft.com/config/debian/$VERSION_ID/packages-microsoft-prod.deb \
+    && dpkg -i /tmp/packages-microsoft-prod.deb \
+    && apt-get update -y \
+    && apt-get install -y unixodbc-dev msodbcsql18 mssql-tools \
+    && apt-get clean \
+    && rm -f /tmp/packages-microsoft-prod.deb \
+    && rm -rf /var/lib/apt/lists/*
+ENV PATH="/opt/mssql-tools/bin:$PATH"
+
+# * Oracle Connector Dependencies
+RUN CURRENT_ARCH=$(uname -m | sed 's|x86_64|x64|g') \
+    && if [ "$CURRENT_ARCH" = "x64" ]; then \
+        curl https://download.oracle.com/otn_software/linux/instantclient/211000/instantclient-basic-linux.$CURRENT_ARCH-21.1.0.0.0.zip \
+            -o /tmp/oracle-instantclient.zip \
+        && curl https://download.oracle.com/otn_software/linux/instantclient/211000/instantclient-sqlplus-linux.$CURRENT_ARCH-21.1.0.0.0.zip \
+            -o /tmp/oracle-sqlplus.zip \
+        && unzip /tmp/oracle-instantclient.zip -d /opt/oracle \
+        && unzip /tmp/oracle-sqlplus.zip -d /opt/oracle \
+        && sh -c "echo '/opt/oracle/instantclient_21_1' > /etc/ld.so.conf.d/oracle-instantclient.conf" \
+        && ldconfig \
+        && rm -rf /tmp/* \
+    ; fi
+
+ENV PATH="/opt/oracle/instantclient_21_1:$PATH"
+ENV LD_LIBRARY_PATH=/opt/oracle/instantclient_21_1
+ENV ORACLE_HOME=/opt/oracle/instantclient_21_1
+
+FROM deps AS build
+ENV UV_SYSTEM_PYTHON=1
+ENV UV_FROZEN=true
+ENV UV_NO_CACHE=true
+ENV COMMON_UV_ARGS="--no-dev --all-extras --no-extra local --no-extra local-gpu"
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+COPY ./uv.lock ./pyproject.toml ./
+RUN uv sync ${COMMON_UV_ARGS} \
+    --no-install-project --no-install-package torch
+
+RUN uv sync ${COMMON_UV_ARGS} --no-install-project
+
+COPY mostlyai ./mostlyai
+COPY README.md ./
+RUN uv sync ${COMMON_UV_ARGS}
+
+COPY ./tools/docker_entrypoint.py /opt/app-root/src/entrypoint.py
+
+EXPOSE 8080
+ENTRYPOINT [ "uv", "run", "--no-sync", "/opt/app-root/src/entrypoint.py" ]
diff --git a/Makefile b/Makefile
index a0bc4084..0f828035 100644
--- a/Makefile
+++ b/Makefile
@@ -40,6 +40,38 @@ COMMON_OPTIONS = \
 clean: ## Remove .gitignore files
 	git clean -fdX
 
+# Variables for docker-run
+HOST_PORT ?= 8080
+HOST_PATH ?=
+
+.PHONY: docker-build
+docker-build: ## Build the docker image
+	DOCKER_BUILDKIT=1 docker build . --platform=linux/amd64 -t mostlyai/mostlyai
+
+.PHONY: docker-run
+docker-run: ## Start the docker container
+	@echo "Mapping port: $(HOST_PORT) (host) <-> 8080 (container)"
+	@# here we have to make sure .venv folder is set as an anonymous volume, so that it will not be overwritten by a bind mount
+	@# ref: https://docs.astral.sh/uv/guides/integration/docker/#mounting-the-project-with-docker-run
+	@if [ -z "$(HOST_PATH)" ]; then \
+		docker run --platform=linux/amd64 -it -p $(HOST_PORT):8080 \
+			-v ~/.cache/huggingface:/opt/app-root/src/.cache/huggingface \
+			mostlyai/mostlyai ; \
+	else \
+		if [ ! 
-d $(HOST_PATH) ]; then \ + echo "Failed to mount volume: $(HOST_PATH) does not exist"; \ + exit 1; \ + fi; \ + REAL_PATH=$$(realpath $(HOST_PATH)); \ + BASE_NAME=$$(basename $$REAL_PATH); \ + MOUNT_ARGS="--mount type=bind,source=$$REAL_PATH,target=/opt/app-root/src/$$BASE_NAME"; \ + echo "Mounting volume: $$REAL_PATH (host) <-> /opt/app-root/src/$$BASE_NAME (container)"; \ + docker run --platform=linux/amd64 --rm -it -p $(HOST_PORT):8080 \ + -v ~/.cache/huggingface:/opt/app-root/src/.cache/huggingface \ + -v /opt/app-root/src/mostlyai/.venv \ + $$MOUNT_ARGS mostlyai/mostlyai ; \ + fi; + # Default files to update PYPROJECT_TOML = pyproject.toml INIT_FILE = mostlyai/sdk/__init__.py diff --git a/README.md b/README.md index 55422629..398b65c0 100644 --- a/README.md +++ b/README.md @@ -30,38 +30,36 @@ The SDK allows you to programmatically create, browse and manage 3 key resources | Live probe the generator on demand | `df = mostly.probe(g, config)` | [mostly.probe](https://mostly-ai.github.io/mostlyai/api_client/#mostlyai.sdk.client.api.MostlyAI.probe) | | Connect to any data source within your org | `c = mostly.connect(config)` | [mostly.connect](https://mostly-ai.github.io/mostlyai/api_client/#mostlyai.sdk.client.api.MostlyAI.connect) | -https://github.com/user-attachments/assets/d1613636-06e4-4147-bef7-25bb4699e8fc - + ## Key Features - **Broad Data Support** - - Mixed-type data (categorical, numerical, geospatial, text, etc.) - - Single-table, multi-table, and time-series + - Mixed-type data (categorical, numerical, geospatial, text, etc.) + - Single-table, multi-table, and time-series - **Multiple Model Types** - - TabularARGN for SOTA tabular performance - - Fine-tune HuggingFace-based language models - - Efficient LSTM for text synthesis from scratch + - TabularARGN for SOTA tabular performance + - Fine-tune HuggingFace-based language models + - Efficient LSTM for text synthesis from scratch - **Advanced Training Options** - - GPU/CPU support - - Differential Privacy - - Progress Monitoring + - GPU/CPU support + - Differential Privacy + - Progress Monitoring - **Automated Quality Assurance** - - Quality metrics for fidelity and privacy - - In-depth HTML reports for visual analysis + - Quality metrics for fidelity and privacy + - In-depth HTML reports for visual analysis - **Flexible Sampling** - - Up-sample to any data volumes - - Conditional generation by any columns - - Re-balance underrepresented segments - - Context-aware data imputation - - Statistical fairness controls - - Rule-adherence via temperature + - Up-sample to any data volumes + - Conditional generation by any columns + - Re-balance underrepresented segments + - Context-aware data imputation + - Statistical fairness controls + - Rule-adherence via temperature - **Seamless Integration** - - Connect to external data sources (DBs, cloud storages) - - Fully permissive open-source license - + - Connect to external data sources (DBs, cloud storages) + - Fully permissive open-source license -## Quick Start Run on Colab +## Quick Start [![Open in Colab](https://img.shields.io/badge/Open%20in-Colab-blue?logo=google-colab)](https://colab.research.google.com/github/mostly-ai/mostlyai/blob/main/docs/tutorials/getting-started/getting-started.ipynb) Install the SDK via pip: @@ -91,14 +89,14 @@ g = mostly.train( "name": "census", "data": df_original, "tabular_model_configuration": { # tabular model configuration (optional) - "max_training_time": 1, # - limit training time (in minutes) - # model, max_epochs,,.. 
# further model configurations (optional) - # 'differential_privacy': { # differential privacy configuration (optional) - # 'max_epsilon': 5.0, # - max epsilon value, used as stopping criterion - # 'delta': 1e-5, # - delta value + "max_training_time": 1, # - limit training time (in minutes) + # model, max_epochs,,.. # further model configurations (optional) + # 'differential_privacy': { # differential privacy configuration (optional) + # 'max_epsilon': 5.0, # - max epsilon value, used as stopping criterion + # 'delta': 1e-5, # - delta value # } }, - # columns, keys, compute,.. # further table configurations (optional) + # columns, keys, compute,.. # further table configurations (optional) } ], }, @@ -144,7 +142,7 @@ df_samples Use `pip` (or better `uv pip`) to install the official `mostlyai` package via PyPI. Python 3.10 or higher is required. It is recommended to install the package within a dedicated virtual environment. -**CLIENT mode only** +### CLIENT mode only This is a light-weight installation for using the SDK in CLIENT mode only. It communicates to a MOSTLY AI platform to perform requested tasks. See e.g. [app.mostly.ai](https://app.mostly.ai/) for a free-to-use hosted version. @@ -152,7 +150,7 @@ This is a light-weight installation for using the SDK in CLIENT mode only. It co pip install -U mostlyai ``` -**CLIENT + LOCAL mode** +### CLIENT + LOCAL mode This is a full installation for using the SDK in both CLIENT and LOCAL mode. It includes all dependencies, incl. PyTorch, for training and generating synthetic data locally. @@ -166,6 +164,7 @@ pip install -U 'mostlyai[local-gpu]' ``` Add any of the following extras for further data connectors support in LOCAL mode: `databricks`, `googlebigquery`, `hive`, `mssql`, `mysql`, `oracle`, `postgres`, `snowflake`. E.g. + ```shell pip install -U 'mostlyai[local, databricks, snowflake]' ``` diff --git a/tools/docker_entrypoint.py b/tools/docker_entrypoint.py new file mode 100644 index 00000000..4526e5b8 --- /dev/null +++ b/tools/docker_entrypoint.py @@ -0,0 +1,58 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def main() -> None: + """ + Entrypoint for the MostlyAI SDK Docker image. + Can be called without any arguments which would start in a Local mode, running on port 8080. + Alternatively, any arguments can be passed as key-value pairs and they will be used when initiating the MostlyAI class. 
+
+    Example:
+    ```bash
+    docker run mostlyai/mostlyai
+    # Connected to http://127.0.0.1:8080 with 16 GB RAM, 11 CPUs, 0 GPUs available
+
+    docker run mostlyai/mostlyai --local=True --local_port=8082 --ssl_verify=False
+    # Connected to http://127.0.0.1:8082 with 16 GB RAM, 11 CPUs, 0 GPUs available
+    ```
+    """
+    from argparse import ArgumentParser
+    from time import sleep
+
+    parser = ArgumentParser(description="MostlyAI SDK Docker Entrypoint.")
+    _, args = parser.parse_known_args()
+    kwargs = {}
+    for arg in args:
+        if arg.startswith("--"):
+            key, value = arg.removeprefix("--").split("=", 1)  # "--key=value" -> ("key", "value")
+            kwargs[key] = value
+    if len(kwargs) == 0:
+        kwargs = {"local": True, "local_port": 8080}  # default: LOCAL mode on port 8080
+
+    print("Startup may take a few seconds while libraries are being loaded...")
+
+    from mostlyai.sdk import MostlyAI
+
+    MostlyAI(**kwargs)
+
+    try:
+        while True:  # keep the container alive until interrupted
+            sleep(1)
+    except KeyboardInterrupt:
+        print("Shutting down...")
+
+
+if __name__ == "__main__":
+    main()
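
For reference, once the container is up via `make docker-run`, the SDK server listens on the published port (`HOST_PORT`, 8080 by default). Below is a minimal host-side sketch of CLIENT-mode usage against that container; it assumes the `MostlyAI` client accepts a `base_url` pointing at the published port, which this diff does not show — verify the parameter name against the client docs for your SDK version.

```python
# Minimal host-side sketch (CLIENT mode), assuming the container was started
# with `make docker-run` and that MostlyAI accepts a `base_url` argument
# pointing at the published port. Adjust if the client signature differs.
import pandas as pd

from mostlyai.sdk import MostlyAI

mostly = MostlyAI(base_url="http://localhost:8080")  # HOST_PORT from the Makefile

# Tiny in-memory dataset as a stand-in for real training data.
df_original = pd.DataFrame({
    "age": [26, 30, 41, 52, 38, 61, 29, 47],
    "income": [32000, 41000, 65000, 78000, 55000, 90000, 38000, 70000],
})

# Same config shape as the README quick start: one generator with one flat table.
g = mostly.train(config={
    "name": "census",
    "tables": [{"name": "census", "data": df_original}],
})

# Live-probe the generator for a few synthetic rows (or pass a config, per the README).
df_samples = mostly.probe(g, size=10)
print(df_samples)
```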
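
The entrypoint forwards every `--key=value` argument to `MostlyAI(...)` as a string. The sketch below mirrors that parsing loop and adds a hypothetical `coerce()` helper (not part of this diff) that turns values such as `"True"` or `"8082"` into Python `bool`/`int` before they are passed on, in case stricter typing is ever needed.

```python
# Standalone illustration of the entrypoint's "--key=value" parsing, plus a
# hypothetical coercion step. Nothing here is part of the shipped entrypoint.
import json


def coerce(value: str):
    """Best-effort conversion of a CLI string to bool/int/float, else keep the string."""
    try:
        return json.loads(value.lower()) if value.lower() in ("true", "false") else json.loads(value)
    except json.JSONDecodeError:
        return value


def parse_cli_kwargs(args: list[str]) -> dict:
    """Mirror the entrypoint loop: keep only well-formed --key=value pairs."""
    kwargs = {}
    for arg in args:
        if arg.startswith("--") and "=" in arg:
            key, value = arg.removeprefix("--").split("=", 1)
            kwargs[key] = coerce(value)
    return kwargs


print(parse_cli_kwargs(["--local=True", "--local_port=8082", "--ssl_verify=False"]))
# {'local': True, 'local_port': 8082, 'ssl_verify': False}
```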