diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index 7bb603ee4..d871e3142 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 parameterized pandas diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 761c30ef2..ee5c4e7f2 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index d9b3b8799..4226758bd 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index 21390b8b1..da9d63534 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 266f44c36..013ce9011 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.4.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.3 \ No newline at end of file diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 266f44c36..013ce9011 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.4.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.3 \ No newline at end of file diff --git a/transforms/language/doc_chunk/requirements.txt b/transforms/language/doc_chunk/requirements.txt index 4497d92f9..c24f0113b 100644 --- a/transforms/language/doc_chunk/requirements.txt +++ b/transforms/language/doc_chunk/requirements.txt @@ -1,4 +1,3 @@ -data-prep-toolkit>=0.2.3 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_quality/requirements.txt b/transforms/language/doc_quality/requirements.txt index b02a19d46..8b1378917 100644 --- a/transforms/language/doc_quality/requirements.txt +++ b/transforms/language/doc_quality/requirements.txt @@ -1,2 +1 @@ -data-prep-toolkit>=0.2.3 diff --git a/transforms/language/html2parquet/requirements.txt b/transforms/language/html2parquet/requirements.txt index e5dafa64b..40b13d614 100644 --- a/transforms/language/html2parquet/requirements.txt +++ b/transforms/language/html2parquet/requirements.txt @@ -1,2 +1 @@ -data-prep-toolkit>=0.2.3 trafilatura==1.12.0 diff --git a/transforms/language/lang_id/requirements.txt b/transforms/language/lang_id/requirements.txt index ea00fe33e..6d0647329 100644 --- a/transforms/language/lang_id/requirements.txt +++ b/transforms/language/lang_id/requirements.txt @@ -1,4 +1,3 @@ -data-prep-toolkit>=0.2.3 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 08d525a55..e33a8c166 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.4.dev9 +data-prep-toolkit>=0.2.3 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/text_encoder/requirements.txt b/transforms/language/text_encoder/requirements.txt index eb3813354..286f87d76 100644 --- a/transforms/language/text_encoder/requirements.txt +++ b/transforms/language/text_encoder/requirements.txt @@ -1,2 +1 @@ -data-prep-toolkit>=0.2.3 sentence-transformers==3.0.1 diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 431de9983..34e24acef 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -29,26 +29,55 @@ all = { file = [ "code/code_quality/python/requirements.txt", "code/code2parquet/python/requirements.txt", -"language/doc_quality/python/requirements.txt", +##### pii_redactor seem to be failing UT +## "language/pii_redactor/python/requirements.txt", + +"universal/fdedup/python/requirements.txt", +"universal/profiler/python/requirements.txt", +"universal/filter/python/requirements.txt", +"universal/resize/python/requirements.txt", +"universal/tokenization/python/requirements.txt", + +"language/doc_quality/requirements.txt", "language/doc_chunk/requirements.txt", ##### Cannot have html2parquet until we solve ## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 ## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -## "language/html2parquet/python/requirements.txt", +## "language/html2parquet/requirements.txt", +"language/lang_id/requirements.txt", +"language/text_encoder/requirements.txt", +"language/pdf2parquet/requirements.txt", + +"universal/doc_id/requirements.txt", +"universal/ededup/requirements.txt", +"universal/hap/requirements.txt", +"universal/web2parquet/requirements.txt" +]} + +language = { file = [ ##### pii_redactor seem to be failing UT ## "language/pii_redactor/python/requirements.txt", "language/lang_id/python/requirements.txt", "language/text_encoder/requirements.txt", -"language/pdf2parquet/python/requirements.txt", "universal/hap/python/requirements.txt", "universal/tokenization/python/requirements.txt", "universal/ededup/python/requirements.txt", "universal/fdedup/python/requirements.txt", -"universal/profiler/python/requirements.txt", -"universal/doc_id/python/requirements.txt", -"universal/filter/python/requirements.txt", -"universal/resize/python/requirements.txt", + +"language/doc_quality/requirements.txt", +"language/doc_chunk/requirements.txt", +##### Cannot have html2parquet until we solve +## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 +## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" +## "language/html2parquet/requirements.txt", +"language/lang_id/requirements.txt", +"language/text_encoder/requirements.txt", +"language/pdf2parquet/requirements.txt", + +"universal/doc_id/requirements.txt", +"universal/ededup/requirements.txt", +"universal/hap/requirements.txt", "universal/web2parquet/requirements.txt" ]} @@ -62,45 +91,52 @@ code_quality = { file = ["code/code_quality/python/requirements.txt"]} code2parquet = {file = ["code/code2parquet/python/requirements.txt"]} code_profiler = { file = ["code/code_profiler/python/requirements.txt"]} -doc_quality = { file = ["language/doc_quality/python/requirements.txt"]} -doc_chunk = { file = ["language/doc_chunk/requirements.txt"]} -html2parquet = { file = ["language/html2parquet/python/requirements.txt"]} pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]} -lang_id = { file = ["language/lang_id/python/requirements.txt"]} -text_encoder = { file = ["language/text_encoder/requirements.txt"]} -pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]} -hap = { file = ["universal/hap/python/requirements.txt"]} -tokenization = { file = ["universal/tokenization/python/requirements.txt"]} ededup = { file = ["universal/ededup/python/requirements.txt"]} fdedup = { file = ["universal/fdedup/python/requirements.txt"]} profiler = { file = ["universal/profiler/python/requirements.txt"]} -doc_id = { file = ["universal/doc_id/python/requirements.txt"]} filter = { file = ["universal/filter/python/requirements.txt"]} resize = { file = ["universal/resize/python/requirements.txt"]} +tokenization = { file = ["universal/tokenization/python/requirements.txt"]} + +######## Named transforms +doc_chunk = { file = ["language/doc_chunk/requirements.txt"]} +doc_quality = { file = ["language/doc_quality/requirements.txt"]} +html2parquet = { file = ["language/html2parquet/requirements.txt"]} +lang_id = { file = ["language/lang_id/requirements.txt"]} +pdf2parquet = { file = ["language/pdf2parquet/requirements.txt"]} +text_encoder = { file = ["language/text_encoder/requirements.txt"]} + +doc_id = { file = ["universal/doc_id/requirements.txt"]} +hap = { file = ["universal/hap/requirements.txt"]} web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them -[tool.setuptools.package-data] -"*" = ["*.txt"] - -# To include this, comment out the package.find section, -# uncomment the package-dir section and rerun the build -# while keeping the build folder from previous run -#[tool.setuptools.packages.find] -#where = ["src"] - -# To include this, comment out the package.find section, -# uncomment the package-dir section and rerun the build -# while keeping the build folder from previous run -#[tool.setuptools.package-dir] -#dpk_pdf2parquet = "language/html2parquet/dpk_pdf2parquet" -#dpk_doc_chunck = "universal/doc_chunck/dpk_doc_chunk" -#dpk_text_encoder = "language/text_encoder/dpk_text_encoder" -#dpk_html2parquet = "language/html2parquet/dpk_html2parquet" -#dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" +# When combing named modules with flat modules, need to run +# the build twice, once with the block below commented out +# and once after adding the lines below +[tool.setuptools.package-dir] +dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" +dpk_doc_chunk = "language/doc_chunk/dpk_doc_chunk" +dpk_doc_quality = "language/doc_quality/dpk_doc_quality" +dpk_html2parquet = "language/html2parquet/dpk_html2parquet" +dpk_lang_id = "language/lang_id/dpk_lang_id" +dpk_pdf2parquet = "language/pdf2parquet/dpk_pdf2parquet" +dpk_text_encoder = "language/text_encoder/dpk_text_encoder" +dpk_doc_id = "universal/doc_id/dpk_doc_id" +dpk_hap = "universal/hap/dpk_hap" + +#[tool.setuptools.package-data] +#"*" = ["*.txt"] + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src"] [tool.pytest.ini_options] # Currently we use low coverage since we have to run tests separately (see makefile) diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 225a73b7e..517d039d4 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.4.dev0 +data-prep-toolkit[ray]>=0.2.3 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 1099df032..fc37b9fc0 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 diff --git a/transforms/transforms-1.0-lang.ipynb b/transforms/transforms-1.0-lang.ipynb index 2545eec07..38e684415 100644 --- a/transforms/transforms-1.0-lang.ipynb +++ b/transforms/transforms-1.0-lang.ipynb @@ -161,62 +161,6 @@ "#table.to_pandas()" ] }, - { - "cell_type": "code", - "execution_count": 10, - "id": "38480cd5", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "10:56:59 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'document_id', 'use_snapshot': False, 'snapshot_directory': None}\n", - "10:56:59 INFO - pipeline id pipeline_id\n", - "10:56:59 INFO - code location None\n", - "10:56:59 INFO - data factory data_ is using local data access: input_folder - doc-chunk-files output_folder - dedup-files\n", - "10:56:59 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:56:59 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:56:59 INFO - orchestrator ededup started at 2024-12-14 10:56:59\n", - "10:56:59 INFO - Number of files is 1, source profile {'max_file_size': 0.03043651580810547, 'min_file_size': 0.03043651580810547, 'total_file_size': 0.03043651580810547}\n", - "10:56:59 INFO - Starting from the beginning\n", - "10:56:59 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "10:56:59 INFO - Done processing 1 files, waiting for flush() completion.\n", - "10:56:59 INFO - done flushing in 0.0 sec\n", - "10:56:59 INFO - Completed execution in 0.0 min, execution result 0\n" - ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from dpk_ededup.transform_python import Ededup\n", - "Ededup(input_folder=\"doc-chunk-files\",\n", - " output_folder=\"dedup-files\",\n", - " ededup_doc_column=\"contents\",\n", - " ededup_doc_id_column=\"document_id\").transform()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "27e36a8e", - "metadata": {}, - "outputs": [], - "source": [ - "##### **** To explote the output from eDedup, run the code below\n", - "#table = pq.read_table('dedup-files/arxiv_org_2408.09869v5.pdf_application.parquet')\n", - "#table.to_pandas()" - ] - }, { "cell_type": "code", "execution_count": 12, @@ -255,7 +199,7 @@ ], "source": [ "from dpk_lang_id.transform_python import LangId\n", - "LangId(input_folder= \"dedup-files\",\n", + "LangId(input_folder= \"doc-chunk-files\",\n", " output_folder= \"langId-files\",\n", " lang_id_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n", " lang_id_model_kind= \"fasttext\",\n", @@ -302,7 +246,7 @@ "source": [ "%%capture\n", "from dpk_doc_quality.transform_python import DocQuality\n", - "DocQuality(input_folder='dedup-files',\n", + "DocQuality(input_folder='doc-chunk-files',\n", " output_folder= 'doc-quality-files',\n", " docq_text_lang = \"en\",\n", " docq_doc_content_column =\"contents\").transform()" diff --git a/transforms/universal/doc_id/requirements.txt b/transforms/universal/doc_id/requirements.txt index 013ce9011..e69de29bb 100644 --- a/transforms/universal/doc_id/requirements.txt +++ b/transforms/universal/doc_id/requirements.txt @@ -1 +0,0 @@ -data-prep-toolkit>=0.2.3 \ No newline at end of file diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index 9c0ba65ba..99fe74aad 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt index efc3769cc..3d91e5ba4 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt index bac39b169..782ef76e2 100644 --- a/transforms/universal/fdedup/ray/requirements.txt +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.4.dev0 +data-prep-toolkit[ray]>=0.2.3 dpk_fdedup_transform_python==0.2.4.dev0 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index da62c43ce..c298deafd 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 duckdb>=0.10.1 diff --git a/transforms/universal/hap/requirements.txt b/transforms/universal/hap/requirements.txt index b40c1d94e..079767b7a 100644 --- a/transforms/universal/hap/requirements.txt +++ b/transforms/universal/hap/requirements.txt @@ -1,4 +1,3 @@ -data-prep-toolkit>=0.2.3 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index d89c81dfa..c9c874ffe 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 266f44c36..013ce9011 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.4.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.3 \ No newline at end of file diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index 0da904f59..56e81f87c 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.4.dev0 +data-prep-toolkit>=0.2.3 transformers==4.38.2 diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index be7eaacca..abae38b7a 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1 @@ -data-prep-toolkit>=0.2.3 data_prep_connector>=0.2.3 \ No newline at end of file