Skip to content

Commit

Permalink
Merge pull request #879 from touma-I/alpha-1.0
Browse files Browse the repository at this point in the history
Transforms 1.0.0a0 refactored language transforms
  • Loading branch information
touma-I authored Dec 17, 2024
2 parents 9e1b281 + 6a43092 commit 03b7e09
Show file tree
Hide file tree
Showing 450 changed files with 4,851 additions and 12,005 deletions.
10 changes: 5 additions & 5 deletions .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0
# The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release.
DPK_MINOR_VERSION=2
# The minor version is incremented AUTOMATICALLY by the release.sh script when a new release is set.
DPK_MICRO_VERSION=3
DPK_MICRO_VERSION=4
# The suffix is generally always set in the main/development branch and only nulled out when creating release branches.
# It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi.
DPK_VERSION_SUFFIX=
DPK_VERSION_SUFFIX=.dev0

DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX)

Expand All @@ -36,8 +36,8 @@ DPK_LIB_KFP_VERSION=$(DPK_VERSION)
DPK_LIB_KFP_VERSION_v2=$(DPK_VERSION)
DPK_LIB_KFP_SHARED=$(DPK_VERSION)

KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION)
KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION)
KFP_DOCKER_VERSION=0.2.3
KFP_DOCKER_VERSION_v2=0.2.3

DPK_CONNECTOR_VERSION=0.2.4.dev0

Expand Down Expand Up @@ -66,4 +66,4 @@ endif
#
# If you change the versions numbers, be sure to run "make set-versions" to
# update version numbers across the transform (e.g., pyproject.toml).
TRANSFORMS_PKG_VERSION=0.2.3
TRANSFORMS_PKG_VERSION=1.0.0a0
2 changes: 1 addition & 1 deletion data-processing-lib/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit"
version = "0.2.3"
version = "0.2.4.dev0"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
requires-python = ">=3.10,<3.13"
description = "Data Preparation Toolkit Library for Ray and Python"
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/createRayClusterComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/deleteRayClusterComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/executeRayJobComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ inputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/executeSubWorkflowComponent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ outputs:

implementation:
container:
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists, and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_v1"
version = "0.2.3"
version = "0.2.4.dev0"
requires-python = ">=3.10,<3.13"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -13,7 +13,7 @@ authors = [
]
dependencies = [
"kfp==1.8.22",
"data-prep-toolkit-kfp-shared==0.2.3",
"data-prep-toolkit-kfp-shared==0.2.4.dev0",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_v2"
version = "0.2.3"
version = "0.2.4.dev0"
requires-python = ">=3.10,<3.13"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"kfp==2.8.0",
"kfp-kubernetes==1.2.0",
"data-prep-toolkit-kfp-shared==0.2.3",
"data-prep-toolkit-kfp-shared==0.2.4.dev0",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/shared_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_shared"
version = "0.2.3"
version = "0.2.4.dev0"
requires-python = ">=3.10,<3.13"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"requests",
"kubernetes",
"data-prep-toolkit[ray]>=0.2.3",
"data-prep-toolkit[ray]>=0.2.4.dev0",
]

[build-system]
Expand Down
2 changes: 1 addition & 1 deletion scripts/check-workflows.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ if [ ! -d transforms ]; then
echo Please run this script from the top of the repository
exit 1
fi
KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser fdedup"
KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser fdedup hap"
while [ $# -ne 0 ]; do
case $1 in
-show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0;
Expand Down
18 changes: 9 additions & 9 deletions scripts/k8s-setup/populate_minio.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@ mc cp --recursive ${REPOROOT}/transforms/code/repo_level_ordering/ray/test-data/
mc cp --recursive ${REPOROOT}/transforms/code/license_select/ray/test-data/input/ kfp/test/license_select/input
mc cp --recursive ${REPOROOT}/transforms/code/license_select/ray/test-data/sample_approved_licenses.json kfp/test/license_select/
# language
mc cp --recursive ${REPOROOT}/transforms/language/lang_id/ray/test-data/input/ kfp/test/lang_id/input
mc cp --recursive ${REPOROOT}/transforms/language/doc_quality/ray/test-data/input/ kfp/test/doc_quality/input
mc cp --recursive ${REPOROOT}/transforms/language/pdf2parquet/ray/test-data/input/2206.01062.pdf kfp/test/pdf2parquet/input
mc cp --recursive ${REPOROOT}/transforms/language/text_encoder/ray/test-data/input/ kfp/test/text_encoder/input
mc cp --recursive ${REPOROOT}/transforms/language/doc_chunk/ray/test-data/input/ kfp/test/doc_chunk/input
mc cp --recursive ${REPOROOT}/transforms/language/html2parquet/ray/test-data/input/test1.html kfp/test/html2parquet/input
mc cp --recursive ${REPOROOT}/transforms/language/lang_id/test-data/input/ kfp/test/lang_id/input
mc cp --recursive ${REPOROOT}/transforms/language/doc_quality/test-data/input/ kfp/test/doc_quality/input
mc cp --recursive ${REPOROOT}/transforms/language/pdf2parquet/test-data/input/2206.01062.pdf kfp/test/pdf2parquet/input
mc cp --recursive ${REPOROOT}/transforms/language/text_encoder/test-data/input/ kfp/test/text_encoder/input
mc cp --recursive ${REPOROOT}/transforms/language/doc_chunk/test-data/input/ kfp/test/doc_chunk/input
mc cp --recursive ${REPOROOT}/transforms/language/html2parquet/test-data/input/test1.html kfp/test/html2parquet/input
# universal
mc cp --recursive ${REPOROOT}/transforms/universal/doc_id/ray/test-data/input/ kfp/test/doc_id/input
mc cp --recursive ${REPOROOT}/transforms/universal/doc_id/test-data/input/ kfp/test/doc_id/input
mc cp --recursive ${REPOROOT}/transforms/universal/ededup/ray/test-data/input/ kfp/test/ededup/input
mc cp --recursive ${REPOROOT}/transforms/universal/fdedup/ray/test-data/input/ kfp/test/fdedup/input
mc cp --recursive ${REPOROOT}/transforms/universal/filter/ray/test-data/input/ kfp/test/filter/input
mc cp --recursive ${REPOROOT}/transforms/universal/noop/ray/test-data/input/ kfp/test/noop/input
mc cp --recursive ${REPOROOT}/transforms/universal/tokenization/ray/test-data/ds01/input/ kfp/test/tokenization/ds01/input
mc cp --recursive ${REPOROOT}/transforms/universal/tokenization/test-data/ds01/input/ kfp/test/tokenization/ds01/input
mc cp --recursive ${REPOROOT}/transforms/universal/profiler/ray/test-data/input/ kfp/test/profiler/input
mc cp --recursive ${REPOROOT}/transforms/universal/resize/ray/test-data/input/ kfp/test/resize/input
mc cp --recursive ${REPOROOT}/transforms/universal/hap/ray/test-data/input/ kfp/test/hap/input
mc cp --recursive ${REPOROOT}/transforms/universal/hap/test-data/input/ kfp/test/hap/input
89 changes: 52 additions & 37 deletions transforms/.make.cicd.targets
Original file line number Diff line number Diff line change
Expand Up @@ -51,63 +51,78 @@ publish:

test-image-sequence:: .defaults.lib-whl-image .transforms.test-image-help .transforms.clean

test-image-python:
$(MAKE) BUILD_SPECIFIC_RUNTIME=python test-image

test-image-ray:
$(MAKE) BUILD_SPECIFIC_RUNTIME=ray test-image

test-image-spark:
$(MAKE) BUILD_SPECIFIC_RUNTIME=spark test-image

test-image:: .default.build-lib-wheel
@if [ -e Dockerfile.python ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.python \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
test-image-sequence ; \
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
if [ -e Dockerfile.python ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.python \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
test-image-sequence ; \
fi ;\
fi
@if [ -e Dockerfile.ray ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.ray \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
BASE_IMAGE=$(RAY_BASE_IMAGE) \
test-image-sequence ; \
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \
if [ -e Dockerfile.ray ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.ray \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
BASE_IMAGE=$(RAY_BASE_IMAGE) \
test-image-sequence ; \
fi ;\
fi
@if [ -e Dockerfile.spark ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.spark \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
BASE_IMAGE=$(SPARK_BASE_IMAGE) \
test-image-sequence ; \
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \
if [ -e Dockerfile.spark ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.spark \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
BASE_IMAGE=$(SPARK_BASE_IMAGE) \
test-image-sequence ; \
fi ;\
fi
-rm -rf data-processing-dist


image-python:
@if [ -e Dockerfile.python ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.python \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
.defaults.lib-whl-image ; \
fi
$(MAKE) BUILD_SPECIFIC_RUNTIME=python image

image-ray:
@if [ -e Dockerfile.ray ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.ray \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
BASE_IMAGE=$(RAY_BASE_IMAGE) \
.defaults.lib-whl-image ; \
fi
$(MAKE) BUILD_SPECIFIC_RUNTIME=ray image

image-spark:
@if [ -e Dockerfile.spark ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.spark \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
BASE_IMAGE=$(SPARK_BASE_IMAGE) \
.defaults.lib-whl-image ; \
fi
$(MAKE) BUILD_SPECIFIC_RUNTIME=spark image

image:: .default.build-lib-wheel
## Build all possible images unless a specific runtime is specified
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
$(MAKE) image-python ; \
if [ -e Dockerfile.python ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.python \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
.defaults.lib-whl-image ; \
fi ; \
fi
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \
$(MAKE) image-ray ; \
if [ -e Dockerfile.ray ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.ray \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
BASE_IMAGE=$(RAY_BASE_IMAGE) \
.defaults.lib-whl-image ; \
fi ; \
fi
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \
$(MAKE) image-spark ; \
if [ -e Dockerfile.spark ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.spark \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
BASE_IMAGE=$(SPARK_BASE_IMAGE) \
.defaults.lib-whl-image ; \
fi ; \
fi
-rm -rf data-processing-dist

Expand Down
16 changes: 16 additions & 0 deletions transforms/Makefile.transform.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform and is used to match against
# expected files and is used to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

################################################################################


2 changes: 1 addition & 1 deletion transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@


# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code2parquet/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code2parquet_transform_python"
version = "0.2.3"
version = "0.2.4.dev0"
requires-python = ">=3.10,<3.13"
description = "code2parquet Python Transform"
license = {text = "Apache-2.0"}
Expand Down
6 changes: 3 additions & 3 deletions transforms/code/code2parquet/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code2parquet_transform_ray"
version = "0.2.3"
version = "0.2.4.dev0"
requires-python = ">=3.10,<3.13"
description = "code2parquet Ray Transform"
license = {text = "Apache-2.0"}
Expand All @@ -10,8 +10,8 @@ authors = [
{ name = "Boris Lublinsky", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit[ray]>=0.2.3",
"dpk-code2parquet-transform-python==0.2.3",
"data-prep-toolkit[ray]>=0.2.4.dev0",
"dpk-code2parquet-transform-python==0.2.4.dev0",
"parameterized",
"pandas",
]
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code_profiler/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code_profiler_transform_python"
version = "0.2.3"
version = "0.2.4.dev0"
requires-python = ">=3.10,<3.13"
description = "Code Profiler Python Transform"
license = {text = "Apache-2.0"}
Expand Down
6 changes: 3 additions & 3 deletions transforms/code/code_profiler/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code_profiler_transform_ray"
version = "0.2.3"
version = "0.2.4.dev0"
requires-python = ">=3.10,<3.13"
description = "Code Profiler Ray Transform"
license = {text = "Apache-2.0"}
Expand All @@ -9,8 +9,8 @@ authors = [
{ name = "Pankaj Thorat", email = "[email protected]" },
]
dependencies = [
"dpk-code-profiler-transform-python==0.2.3",
"data-prep-toolkit[ray]>=0.2.3",
"dpk-code-profiler-transform-python==0.2.4.dev0",
"data-prep-toolkit[ray]>=0.2.4.dev0",
]

[build-system]
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code_quality/kfp_ray/code_quality_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest"

# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"

# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code_quality/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code_quality_transform_python"
version = "0.2.3"
version = "0.2.4.dev0"
requires-python = ">=3.10,<3.13"
description = "Code Quality Python Transform"
license = {text = "Apache-2.0"}
Expand Down
Loading

0 comments on commit 03b7e09

Please sign in to comment.