Skip to content

Commit

Permalink
update dependencies
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <[email protected]>
  • Loading branch information
touma-I committed Dec 17, 2024
1 parent 9c2e3a4 commit 6a43092
Show file tree
Hide file tree
Showing 26 changed files with 88 additions and 116 deletions.
2 changes: 1 addition & 1 deletion transforms/code/code2parquet/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
parameterized
pandas
2 changes: 1 addition & 1 deletion transforms/code/code_profiler/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
parameterized
pandas
aiolimiter==1.1.0
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code_quality/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
bs4==0.0.2
transformers==4.38.2
2 changes: 1 addition & 1 deletion transforms/code/header_cleanser/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
scancode-toolkit==32.1.0 ; platform_system != 'Darwin'

2 changes: 1 addition & 1 deletion transforms/code/license_select/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
2 changes: 1 addition & 1 deletion transforms/code/proglang_select/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
1 change: 0 additions & 1 deletion transforms/language/doc_chunk/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
data-prep-toolkit>=0.2.3
docling-core==2.3.0
pydantic>=2.0.0,<2.10.0
llama-index-core>=0.11.22,<0.12.0
1 change: 0 additions & 1 deletion transforms/language/doc_quality/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@

data-prep-toolkit>=0.2.3
1 change: 0 additions & 1 deletion transforms/language/html2parquet/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
data-prep-toolkit>=0.2.3
trafilatura==1.12.0
1 change: 0 additions & 1 deletion transforms/language/lang_id/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
data-prep-toolkit>=0.2.3
fasttext==0.9.2
langcodes==3.3.0
huggingface-hub >= 0.21.4, <1.0.0
Expand Down
2 changes: 1 addition & 1 deletion transforms/language/pii_redactor/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
data-prep-toolkit>=0.2.4.dev9
data-prep-toolkit>=0.2.3
presidio-analyzer>=2.2.355
presidio-anonymizer>=2.2.355
flair>=0.14.0
Expand Down
1 change: 0 additions & 1 deletion transforms/language/text_encoder/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
data-prep-toolkit>=0.2.3
sentence-transformers==3.0.1
104 changes: 70 additions & 34 deletions transforms/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,55 @@ all = { file = [
"code/code_quality/python/requirements.txt",
"code/code2parquet/python/requirements.txt",

"language/doc_quality/python/requirements.txt",
##### pii_redactor seems to be failing unit tests (UT)
## "language/pii_redactor/python/requirements.txt",

"universal/fdedup/python/requirements.txt",
"universal/profiler/python/requirements.txt",
"universal/filter/python/requirements.txt",
"universal/resize/python/requirements.txt",
"universal/tokenization/python/requirements.txt",

"language/doc_quality/requirements.txt",
"language/doc_chunk/requirements.txt",
##### Cannot include html2parquet until we resolve this lxml version conflict:
## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1
## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
## "language/html2parquet/python/requirements.txt",
## "language/html2parquet/requirements.txt",
"language/lang_id/requirements.txt",
"language/text_encoder/requirements.txt",
"language/pdf2parquet/requirements.txt",

"universal/doc_id/requirements.txt",
"universal/ededup/requirements.txt",
"universal/hap/requirements.txt",
"universal/web2parquet/requirements.txt"
]}

language = { file = [
##### pii_redactor seems to be failing unit tests (UT)
## "language/pii_redactor/python/requirements.txt",
"language/lang_id/python/requirements.txt",
"language/text_encoder/requirements.txt",
"language/pdf2parquet/python/requirements.txt",

"universal/hap/python/requirements.txt",
"universal/tokenization/python/requirements.txt",
"universal/ededup/python/requirements.txt",
"universal/fdedup/python/requirements.txt",
"universal/profiler/python/requirements.txt",
"universal/doc_id/python/requirements.txt",
"universal/filter/python/requirements.txt",
"universal/resize/python/requirements.txt",

"language/doc_quality/requirements.txt",
"language/doc_chunk/requirements.txt",
##### Cannot include html2parquet until we resolve this lxml version conflict:
## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1
## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
## "language/html2parquet/requirements.txt",
"language/lang_id/requirements.txt",
"language/text_encoder/requirements.txt",
"language/pdf2parquet/requirements.txt",

"universal/doc_id/requirements.txt",
"universal/ededup/requirements.txt",
"universal/hap/requirements.txt",
"universal/web2parquet/requirements.txt"
]}

Expand All @@ -62,45 +91,52 @@ code_quality = { file = ["code/code_quality/python/requirements.txt"]}
code2parquet = {file = ["code/code2parquet/python/requirements.txt"]}
code_profiler = { file = ["code/code_profiler/python/requirements.txt"]}

doc_quality = { file = ["language/doc_quality/python/requirements.txt"]}
doc_chunk = { file = ["language/doc_chunk/requirements.txt"]}
html2parquet = { file = ["language/html2parquet/python/requirements.txt"]}
pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]}
lang_id = { file = ["language/lang_id/python/requirements.txt"]}
text_encoder = { file = ["language/text_encoder/requirements.txt"]}
pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]}

hap = { file = ["universal/hap/python/requirements.txt"]}
tokenization = { file = ["universal/tokenization/python/requirements.txt"]}
ededup = { file = ["universal/ededup/python/requirements.txt"]}
fdedup = { file = ["universal/fdedup/python/requirements.txt"]}
profiler = { file = ["universal/profiler/python/requirements.txt"]}
doc_id = { file = ["universal/doc_id/python/requirements.txt"]}
filter = { file = ["universal/filter/python/requirements.txt"]}
resize = { file = ["universal/resize/python/requirements.txt"]}
tokenization = { file = ["universal/tokenization/python/requirements.txt"]}

######## Named transforms
doc_chunk = { file = ["language/doc_chunk/requirements.txt"]}
doc_quality = { file = ["language/doc_quality/requirements.txt"]}
html2parquet = { file = ["language/html2parquet/requirements.txt"]}
lang_id = { file = ["language/lang_id/requirements.txt"]}
pdf2parquet = { file = ["language/pdf2parquet/requirements.txt"]}
text_encoder = { file = ["language/text_encoder/requirements.txt"]}

doc_id = { file = ["universal/doc_id/requirements.txt"]}
hap = { file = ["universal/hap/requirements.txt"]}
web2parquet = { file = ["universal/web2parquet/requirements.txt"]}

# Does not seem to work for our custom layout
# copy all files to a single src and let automatic discovery find them

[tool.setuptools.package-data]
"*" = ["*.txt"]

# To include this, comment out the package.find section,
# uncomment the package-dir section and rerun the build
# while keeping the build folder from previous run
#[tool.setuptools.packages.find]
#where = ["src"]

# To include this, comment out the package.find section,
# uncomment the package-dir section and rerun the build
# while keeping the build folder from previous run
#[tool.setuptools.package-dir]
#dpk_pdf2parquet = "language/html2parquet/dpk_pdf2parquet"
#dpk_doc_chunck = "universal/doc_chunck/dpk_doc_chunk"
#dpk_text_encoder = "language/text_encoder/dpk_text_encoder"
#dpk_html2parquet = "language/html2parquet/dpk_html2parquet"
#dpk_web2parquet = "universal/web2parquet/dpk_web2parquet"
# When combining named modules with flat modules, the build must be run
# twice: once with the block below commented out,
# and once more after adding the lines below
[tool.setuptools.package-dir]
dpk_web2parquet = "universal/web2parquet/dpk_web2parquet"
dpk_doc_chunk = "language/doc_chunk/dpk_doc_chunk"
dpk_doc_quality = "language/doc_quality/dpk_doc_quality"
dpk_html2parquet = "language/html2parquet/dpk_html2parquet"
dpk_lang_id = "language/lang_id/dpk_lang_id"
dpk_pdf2parquet = "language/pdf2parquet/dpk_pdf2parquet"
dpk_text_encoder = "language/text_encoder/dpk_text_encoder"
dpk_doc_id = "universal/doc_id/dpk_doc_id"
dpk_hap = "universal/hap/dpk_hap"

#[tool.setuptools.package-data]
#"*" = ["*.txt"]

[options]
package_dir = ["src","test"]

[options.packages.find]
where = ["src"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
Expand Down
2 changes: 1 addition & 1 deletion transforms/requirements-ray.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
data-prep-toolkit[ray]>=0.2.4.dev0
data-prep-toolkit[ray]>=0.2.3
networkx==3.3
colorlog==6.8.2
func-timeout==4.3.5
Expand Down
2 changes: 1 addition & 1 deletion transforms/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
60 changes: 2 additions & 58 deletions transforms/transforms-1.0-lang.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -161,62 +161,6 @@
"#table.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "38480cd5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"10:56:59 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'document_id', 'use_snapshot': False, 'snapshot_directory': None}\n",
"10:56:59 INFO - pipeline id pipeline_id\n",
"10:56:59 INFO - code location None\n",
"10:56:59 INFO - data factory data_ is using local data access: input_folder - doc-chunk-files output_folder - dedup-files\n",
"10:56:59 INFO - data factory data_ max_files -1, n_sample -1\n",
"10:56:59 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"10:56:59 INFO - orchestrator ededup started at 2024-12-14 10:56:59\n",
"10:56:59 INFO - Number of files is 1, source profile {'max_file_size': 0.03043651580810547, 'min_file_size': 0.03043651580810547, 'total_file_size': 0.03043651580810547}\n",
"10:56:59 INFO - Starting from the beginning\n",
"10:56:59 INFO - Completed 1 files (100.0%) in 0.0 min\n",
"10:56:59 INFO - Done processing 1 files, waiting for flush() completion.\n",
"10:56:59 INFO - done flushing in 0.0 sec\n",
"10:56:59 INFO - Completed execution in 0.0 min, execution result 0\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dpk_ededup.transform_python import Ededup\n",
"Ededup(input_folder=\"doc-chunk-files\",\n",
" output_folder=\"dedup-files\",\n",
" ededup_doc_column=\"contents\",\n",
" ededup_doc_id_column=\"document_id\").transform()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "27e36a8e",
"metadata": {},
"outputs": [],
"source": [
"##### **** To explore the output from eDedup, run the code below\n",
"#table = pq.read_table('dedup-files/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 12,
Expand Down Expand Up @@ -255,7 +199,7 @@
],
"source": [
"from dpk_lang_id.transform_python import LangId\n",
"LangId(input_folder= \"dedup-files\",\n",
"LangId(input_folder= \"doc-chunk-files\",\n",
" output_folder= \"langId-files\",\n",
" lang_id_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n",
" lang_id_model_kind= \"fasttext\",\n",
Expand Down Expand Up @@ -302,7 +246,7 @@
"source": [
"%%capture\n",
"from dpk_doc_quality.transform_python import DocQuality\n",
"DocQuality(input_folder='dedup-files',\n",
"DocQuality(input_folder='doc-chunk-files',\n",
" output_folder= 'doc-quality-files',\n",
" docq_text_lang = \"en\",\n",
" docq_doc_content_column =\"contents\").transform()"
Expand Down
1 change: 0 additions & 1 deletion transforms/universal/doc_id/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
data-prep-toolkit>=0.2.3
2 changes: 1 addition & 1 deletion transforms/universal/ededup/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
mmh3>=4.1.0
xxhash==3.4.1
2 changes: 1 addition & 1 deletion transforms/universal/fdedup/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
pyyaml>=6.0.2
boto3>=1.34.69
kubernetes>=30.1.0
Expand Down
2 changes: 1 addition & 1 deletion transforms/universal/fdedup/ray/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
data-prep-toolkit[ray]>=0.2.4.dev0
data-prep-toolkit[ray]>=0.2.3
dpk_fdedup_transform_python==0.2.4.dev0
mmh3>=4.1.0
xxhash==3.4.1
Expand Down
2 changes: 1 addition & 1 deletion transforms/universal/filter/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@

data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
duckdb>=0.10.1
1 change: 0 additions & 1 deletion transforms/universal/hap/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
data-prep-toolkit>=0.2.3
nltk==3.9.1
transformers==4.38.2
torch>=2.2.2,<=2.4.1
Expand Down
2 changes: 1 addition & 1 deletion transforms/universal/profiler/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
mmh3==4.1.0
xxhash==3.4.1

2 changes: 1 addition & 1 deletion transforms/universal/resize/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
2 changes: 1 addition & 1 deletion transforms/universal/tokenization/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
data-prep-toolkit>=0.2.4.dev0
data-prep-toolkit>=0.2.3
transformers==4.38.2
1 change: 0 additions & 1 deletion transforms/universal/web2parquet/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
data-prep-toolkit>=0.2.3
data_prep_connector>=0.2.3

0 comments on commit 6a43092

Please sign in to comment.