Skip to content

Commit

Permalink
Merge pull request #25 from meddhiaka/langchain-based-module
Browse files Browse the repository at this point in the history
text-embeddings-model-testcase
  • Loading branch information
meddhiaka authored Jul 28, 2024
2 parents 837b44d + e3e90ec commit acb20eb
Show file tree
Hide file tree
Showing 3 changed files with 349 additions and 1 deletion.
3 changes: 2 additions & 1 deletion llm-service/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
llm
__pycache__
.vscode
.env
.env
notebooks/.ipynb_checkpoints
225 changes: 225 additions & 0 deletions llm-service/notebooks/test-text-embeddings.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0811b72d-3b82-4340-b7e3-74a0a8a96834",
"metadata": {},
"source": [
"# EXAMPLE OF ENCODING A HUMAN PHRASE INTO AN EMBEDDING VECTOR WITH AN OPEN SOURCE LLM\n",
"## without the need for the OpenAI APIs"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "716cc346-b4c5-4a77-b2e0-fc5612e2775b",
"metadata": {},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer"
]
},
{
"cell_type": "markdown",
"id": "d1b165c4-1d5a-4c0b-8580-2d37ba36eb44",
"metadata": {},
"source": [
"> Choosing this model because it is a compact model derived from Google's BERT architecture, and it is lightweight, with a small size of about 90 MB."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a785acb3-3bb9-4197-8081-7c7c96af666d",
"metadata": {},
"outputs": [],
"source": [
"embeddings_model_name = \"all-MiniLM-L6-v2\""
]
},
{
"cell_type": "markdown",
"id": "2e49ff98-7dcb-4534-8b20-eac3fd66a5fe",
"metadata": {},
"source": [
"> passing the model name to the sentence transformer"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "684dab14-82ed-4cf1-8c76-b0f38c6147a8",
"metadata": {},
"outputs": [],
"source": [
"model = SentenceTransformer(embeddings_model_name)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "4d9bf3c3-9c97-4a9a-96b7-267e7b1fbc95",
"metadata": {},
"outputs": [],
"source": [
"sentence = \"my name is Dhia\""
]
},
{
"cell_type": "markdown",
"id": "ce1ff01c-2170-4099-8afa-c0e667d94905",
"metadata": {},
"source": [
"> Encoding the sentence into a vector of numbers, which is the numerical representation (embedding) of the sentence; for this model the vector has 384 dimensions, as the output below shows."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8d653a65-6bec-4952-bb51-9295b814b5d7",
"metadata": {},
"outputs": [],
"source": [
"embeddings = model.encode(sentence)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "08d19237-071a-4113-93ee-7ba991e41702",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-7.05236718e-02, -1.60931088e-02, -1.92479305e-02, 1.82103179e-02,\n",
" -1.19590990e-01, -9.18985754e-02, 7.51580819e-02, -3.63232493e-02,\n",
" 2.11442094e-02, -2.28920374e-02, 1.05619431e-02, -1.18252262e-01,\n",
" -2.40387581e-02, -1.79486386e-02, 1.70201752e-02, 4.35630493e-02,\n",
" 7.33184516e-02, 5.59519157e-02, -7.05158338e-02, -7.57793039e-02,\n",
" -8.79334472e-03, 5.13558164e-02, -7.51523860e-03, -5.52448332e-02,\n",
" -8.38933736e-02, 8.67997929e-02, 1.89609397e-02, 8.48655552e-02,\n",
" -5.67335002e-02, -5.80664948e-02, -5.84636070e-03, 5.19191995e-02,\n",
" 5.97721525e-02, 4.52650115e-02, 8.33715964e-03, 2.26841588e-02,\n",
" -9.92354378e-02, 4.08069119e-02, 8.32620338e-02, 1.73921639e-03,\n",
" 6.85867965e-02, -9.72167328e-02, 3.79107185e-02, -1.32512348e-02,\n",
" -5.76893380e-03, -2.60857865e-02, -1.97701287e-02, 7.47563541e-02,\n",
" 4.70534377e-02, 2.27136072e-02, -7.92886615e-02, -7.42600039e-02,\n",
" 6.13950146e-03, 4.79103960e-02, 1.74055323e-02, -9.38877538e-02,\n",
" 2.13661287e-02, 2.06255969e-02, 1.94132831e-02, 3.22155282e-02,\n",
" -6.01592623e-02, 5.16508967e-02, -7.85991475e-02, 3.60676162e-02,\n",
" 9.74804536e-03, -5.27368449e-02, -9.59350076e-03, -4.41584438e-02,\n",
" -6.48318965e-04, -3.79158370e-02, 2.23576929e-02, -1.42385310e-03,\n",
" 6.25500828e-02, 2.31287759e-02, -3.10945828e-02, -2.09828243e-02,\n",
" 1.75296701e-02, -4.64223474e-02, 2.65836529e-02, 6.10605814e-02,\n",
" -3.62458080e-02, 2.04698704e-02, -2.05210783e-02, 2.86100116e-02,\n",
" -1.22527052e-02, 2.14435309e-02, 1.04064203e-03, 1.54060200e-02,\n",
" -6.37610406e-02, -1.50689214e-01, 1.54529291e-03, 2.26672762e-03,\n",
" -2.75626015e-02, 3.23846228e-02, 3.29131000e-02, -4.94525656e-02,\n",
" 7.62648359e-02, 4.21432778e-03, -1.12518020e-01, 1.14961162e-01,\n",
" -7.31231039e-03, 6.93214545e-03, -3.16390581e-02, 2.70682182e-02,\n",
" -3.52935083e-02, 5.19661084e-02, 1.30002331e-02, -1.85961779e-02,\n",
" 8.98841023e-03, -1.53648190e-03, -8.08810666e-02, 7.99290370e-03,\n",
" -1.11755729e-01, 7.34393075e-02, 2.60063447e-02, -3.00781503e-02,\n",
" 1.59195736e-02, 6.77077919e-02, 1.40962610e-02, -1.24905646e-01,\n",
" -4.50927578e-02, 7.35280383e-03, -5.78532778e-02, 3.28116901e-02,\n",
" -2.02516932e-02, -3.56817171e-02, -3.46721523e-02, -3.62085945e-33,\n",
" -1.29694128e-02, 7.26205297e-03, 2.76725437e-03, 3.52143385e-02,\n",
" -4.85444441e-02, -4.48167659e-02, -8.27146322e-03, -3.28657916e-04,\n",
" -8.39053169e-02, -7.02462345e-02, -2.34898692e-03, 1.86828915e-02,\n",
" -3.00089289e-02, -5.24056479e-02, -1.07456446e-02, 5.05034253e-02,\n",
" 5.13908193e-02, 1.56075982e-02, 2.47835070e-02, 1.23068362e-01,\n",
" 2.50813505e-03, -3.79860289e-02, 1.87070146e-02, -8.99085402e-03,\n",
" -3.61227654e-02, -1.34114446e-02, 1.05591856e-01, -1.17392793e-01,\n",
" 2.12110449e-02, 3.96833718e-02, 1.47764748e-02, -4.92694490e-02,\n",
" 2.95680370e-02, -7.74936229e-02, -3.36638130e-02, 2.03125104e-02,\n",
" -5.83853722e-02, -8.70379284e-02, -7.95805454e-02, 1.48076564e-02,\n",
" -1.19074844e-02, 4.00276892e-02, -3.89190540e-02, -4.20734100e-02,\n",
" -2.39391346e-02, 6.52980804e-02, 7.65080899e-02, -1.13789849e-02,\n",
" -5.86649403e-02, 8.13412201e-03, -8.33811685e-02, -1.65410470e-02,\n",
" -6.14932030e-02, -2.17130911e-02, -1.13271929e-01, -4.89654131e-02,\n",
" 6.26277104e-02, 1.54007236e-02, -1.65686514e-02, -2.15057656e-02,\n",
" -6.42434955e-02, -1.46887125e-02, -5.56580238e-02, 6.99484870e-02,\n",
" -7.84253515e-03, 1.86246298e-02, 3.17439921e-02, -1.00163072e-02,\n",
" 6.39801621e-02, -2.55614556e-02, -2.92685926e-02, -1.47933355e-02,\n",
" 1.49056409e-02, 1.30354568e-01, 6.78878650e-03, 3.58305895e-03,\n",
" 3.56976092e-02, 1.39798652e-02, -9.37632099e-03, 5.42047657e-02,\n",
" 4.79877554e-02, 4.53870781e-02, 4.80338652e-03, -1.31423557e-02,\n",
" 6.36392608e-02, -2.41277069e-02, -2.35502645e-02, -1.66074801e-02,\n",
" -1.24190450e-01, -5.49397841e-02, 4.94708912e-03, 1.95235794e-03,\n",
" 6.09364957e-02, -6.59533544e-04, -6.15168409e-03, 7.21995702e-34,\n",
" -1.44108711e-02, -5.14261089e-02, -1.90891773e-02, -5.28545268e-02,\n",
" 1.62104964e-02, -5.11956662e-02, -4.65710973e-03, 1.26753300e-01,\n",
" 3.39262001e-02, 7.39510208e-02, 6.94529787e-02, -1.76188685e-02,\n",
" 3.66078466e-02, -4.40288149e-03, 5.23058251e-02, 2.44237743e-02,\n",
" 1.69343017e-02, 5.71275167e-02, -8.15093517e-03, -1.34522989e-02,\n",
" 3.23207903e-04, 1.04198314e-01, -1.17139429e-01, -4.00622450e-02,\n",
" 1.41051402e-02, -2.03309990e-02, 1.48287490e-02, 3.14377099e-02,\n",
" -4.37114909e-02, 2.24833433e-02, -5.97753329e-04, 2.32188255e-02,\n",
" -1.46858424e-01, -9.10286792e-03, 2.11682245e-02, -1.25621874e-02,\n",
" -4.95765656e-02, -1.85418781e-02, -7.11793872e-03, 1.49038865e-03,\n",
" -5.11692911e-02, 4.84717973e-02, 7.11453035e-02, 6.39875680e-02,\n",
" -6.86601736e-03, -8.30386505e-02, 1.70927644e-02, 5.09319864e-02,\n",
" 2.93617956e-02, 1.12425182e-02, -1.82068311e-02, -2.58533307e-03,\n",
" -3.19009870e-02, -4.60386239e-02, 1.04666822e-01, 5.33611439e-02,\n",
" 8.56694952e-02, 3.94864194e-03, -4.87567373e-02, -3.91369574e-02,\n",
" -7.97590986e-02, -2.88418569e-02, 3.31391245e-02, 1.20584793e-01,\n",
" 2.34594550e-02, 2.30826046e-02, -2.40478870e-02, -8.14548060e-02,\n",
" -1.21153519e-02, -1.62131917e-02, 3.27826701e-02, 3.36454883e-02,\n",
" -1.05187617e-01, 1.69765316e-02, -4.42352369e-02, -7.09058046e-02,\n",
" -2.72821548e-04, 7.72199854e-02, -1.39463013e-02, 7.12679476e-02,\n",
" 2.28652023e-02, 2.50755297e-03, -2.53640078e-02, 8.83515365e-03,\n",
" 6.56417459e-02, 3.19932923e-02, 1.31409436e-01, 5.93245812e-02,\n",
" -2.94368137e-02, -7.80986948e-03, 7.31366202e-02, 4.08905931e-02,\n",
" 2.38449667e-02, -6.82127774e-02, -8.14835131e-02, -1.51396993e-08,\n",
" -2.56871562e-02, -1.18662650e-02, 1.09907039e-01, 4.61730249e-02,\n",
" -2.53012348e-02, 3.46657410e-02, 1.33930398e-02, 3.80384549e-02,\n",
" 7.26362467e-02, -1.38223963e-03, 8.62098560e-02, 5.70142828e-02,\n",
" 1.13379443e-02, 9.52571630e-03, 6.13231137e-02, -3.78460698e-02,\n",
" 2.51035225e-02, 8.41751322e-02, 2.75778752e-02, -3.47662866e-02,\n",
" 3.33757326e-02, -6.74158707e-02, 3.61631736e-02, -2.23071817e-02,\n",
" 3.92968114e-03, 9.06798337e-03, 8.66555572e-02, -1.49956634e-02,\n",
" -4.32920344e-02, 4.94523626e-03, 6.09504152e-03, 9.25101787e-02,\n",
" 2.55701747e-02, -1.02101257e-02, -1.86148845e-02, 6.98827859e-03,\n",
" 2.56543122e-02, -1.53958295e-02, 2.04253513e-02, -2.09278129e-02,\n",
" 4.95365402e-03, 2.19482724e-02, 7.31995478e-02, 2.31853016e-02,\n",
" -6.91349898e-03, 8.19720402e-02, 6.52711317e-02, -8.28643218e-02,\n",
" 2.28634961e-02, -5.71966954e-02, -4.23230045e-02, -1.15389256e-02,\n",
" 1.22872971e-01, 2.11112667e-02, 4.59483713e-02, 4.16709743e-02,\n",
" -9.27364919e-03, 6.08901754e-02, 4.17120801e-03, -3.22093256e-03,\n",
" 1.36247456e-01, -5.00843301e-02, -7.55917132e-02, -2.26034448e-02],\n",
" dtype=float32)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embeddings"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
122 changes: 122 additions & 0 deletions llm-service/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,33 +1,155 @@
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.2.0
Babel==2.15.0
beautifulsoup4==4.12.3
bleach==6.1.0
certifi==2024.7.4
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
comm==0.2.2
debugpy==1.8.2
decorator==5.1.1
defusedxml==0.7.1
dnspython==2.6.1
email_validator==2.2.0
executing==2.0.1
fastapi==0.111.1
fastapi-cli==0.0.4
fastjsonschema==2.20.0
filelock==3.15.4
fqdn==1.5.1
frozenlist==1.4.1
fsspec==2024.6.1
greenlet==3.0.3
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.24.2
idna==3.7
intel-openmp==2021.4.0
ipykernel==6.29.5
ipython==8.26.0
ipywidgets==8.1.3
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.4
joblib==1.4.2
json5==0.9.25
jsonpatch==1.33
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.2
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.11
langchain==0.2.11
langchain-core==0.2.24
langchain-text-splitters==0.2.2
langsmith==0.1.93
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib-inline==0.1.7
mdurl==0.1.2
mistune==3.0.2
mkl==2021.4.0
mpmath==1.3.0
multidict==6.0.5
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.3
notebook==7.2.1
notebook_shim==0.2.4
numpy==1.26.4
orjson==3.10.6
overrides==7.7.0
packaging==24.1
pandocfilters==1.5.1
parso==0.8.4
pillow==10.4.0
platformdirs==4.2.2
prometheus_client==0.20.0
prompt_toolkit==3.0.47
psutil==6.0.0
pure_eval==0.2.3
pycparser==2.22
pydantic==2.8.2
pydantic_core==2.20.1
Pygments==2.18.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-json-logger==2.0.7
python-multipart==0.0.9
pywin32==306
pywinpty==2.0.13
PyYAML==6.0.1
pyzmq==26.0.3
qtconsole==5.5.2
QtPy==2.4.1
referencing==0.35.1
regex==2024.7.24
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==13.7.1
rpds-py==0.19.1
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.14.0
Send2Trash==1.8.3
sentence-transformers==3.0.1
setuptools==71.1.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.31
stack-data==0.6.3
starlette==0.37.2
sympy==1.13.1
tbb==2021.13.0
tenacity==8.5.0
terminado==0.18.1
threadpoolctl==3.5.0
tinycss2==1.3.0
tokenizers==0.19.1
torch==2.3.0
tornado==6.4.1
tqdm==4.66.4
traitlets==5.14.3
transformers==4.43.3
typer==0.12.3
types-python-dateutil==2.9.0.20240316
typing_extensions==4.12.2
uri-template==1.3.0
urllib3==2.2.2
uvicorn==0.30.3
watchfiles==0.22.0
wcwidth==0.2.13
webcolors==24.6.0
webencodings==0.5.1
websocket-client==1.8.0
websockets==12.0
widgetsnbextension==4.0.11
yarl==1.9.4

0 comments on commit acb20eb

Please sign in to comment.