-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
318 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"cells": [], | ||
"metadata": {}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,312 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "41a4d2f7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"\n", | ||
"from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer\n", | ||
"\n", | ||
"# The Diffbot token is a secret: read it from the environment so it is never\n", | ||
"# saved in the notebook. A missing variable raises KeyError immediately.\n", | ||
"diffbot_api_key = os.environ[\"DIFFBOT_API_KEY\"]\n", | ||
"diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "305c75b7", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'DIFFBOT_API_KEY'" | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"diffbot_nlp.diffbot_api_key" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 51, | ||
"id": "8bb5dbaf", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/wikipedia/wikipedia.py:389: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", | ||
"\n", | ||
"The code that caused this warning is on line 389 of the file /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/wikipedia/wikipedia.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n", | ||
"\n", | ||
" lis = BeautifulSoup(html).find_all('li')\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from langchain.document_loaders import WikipediaLoader\n", | ||
"\n", | ||
"query = \"Mahabharata\"\n", | ||
"raw_documents = WikipediaLoader(query=query).load()\n", | ||
"graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 52, | ||
"id": "7529b1ab", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"\n", | ||
"from langchain.graphs import Neo4jGraph\n", | ||
"\n", | ||
"# Connection settings come from the environment. Only the non-secret values\n", | ||
"# fall back to the local defaults; the password has no default on purpose.\n", | ||
"url = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n", | ||
"username = os.environ.get(\"NEO4J_USERNAME\", \"neo4j\")\n", | ||
"password = os.environ[\"NEO4J_PASSWORD\"]\n", | ||
"\n", | ||
"graph = Neo4jGraph(\n", | ||
"    url=url,\n", | ||
"    username=username,\n", | ||
"    password=password,\n", | ||
")\n", | ||
"\n", | ||
"# Persist the nodes and relationships extracted by Diffbot into Neo4j.\n", | ||
"graph.add_graph_documents(graph_documents)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 42, | ||
"id": "e447125a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"\n", | ||
"from langchain.chains import GraphCypherQAChain\n", | ||
"from langchain.chat_models import ChatOpenAI\n", | ||
"\n", | ||
"# The OpenAI key is a secret: read it from the environment instead of pasting\n", | ||
"# it into the cell. Both models share the same settings, so define them once.\n", | ||
"llm_kwargs = dict(\n", | ||
"    openai_api_key=os.environ[\"OPENAI_API_KEY\"],\n", | ||
"    temperature=0,\n", | ||
"    model_name=\"gpt-3.5-turbo\",\n", | ||
")\n", | ||
"\n", | ||
"chain = GraphCypherQAChain.from_llm(\n", | ||
"    cypher_llm=ChatOpenAI(**llm_kwargs),  # writes the Cypher query\n", | ||
"    qa_llm=ChatOpenAI(**llm_kwargs),  # phrases the answer from the query result\n", | ||
"    graph=graph,\n", | ||
"    verbose=True,\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 48, | ||
"id": "0ec15fde", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"\n", | ||
"\n", | ||
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", | ||
"Generated Cypher:\n", | ||
"\u001b[32;1m\u001b[1;3mMATCH (p:Person {name: 'Elon Musk'})\n", | ||
"RETURN p\u001b[0m\n", | ||
"Full Context:\n", | ||
"\u001b[32;1m\u001b[1;3m[{'p': {'name': 'Elon Musk', 'dateOfBirth': '1971-06-28', 'id': 'http://www.wikidata.org/entity/Q317521', 'positionHeld': 'chairman'}}]\u001b[0m\n", | ||
"\n", | ||
"\u001b[1m> Finished chain.\u001b[0m\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'Elon Musk is the chairman of a certain organization. He was born on June 28, 1971.'" | ||
] | ||
}, | ||
"execution_count": 48, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"chain.run(\"show information on Elon Musk\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 53, | ||
"id": "1c14a4f2", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import youtube" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 56, | ||
"id": "6c8753b9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader=youtube.YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=AzCquTh4MUw\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 60, | ||
"id": "707f8ffe", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[Document(page_content=\"good morning guys today i'm gonna be talking about a very interesting topic that you're not gonna find on youtube or any blogs i'm gonna teach you how to combine semantic search with the power of elasticsearch to develop powerful search engines so essentially uh with semantic search now you can say hey i'm looking for something like this it understands that and it would automatically query our petabyte scale elasticsearch data and give you the relevant result this is a very very interesting it's a hybrid solution so let me show you a small demo and uh talk more about that okay semantic search semantic search seeks to improve search accuracy by understanding the content of the search query in contrast to traditional search engines which only finds document based on lexical matches semantic search can also find synonyms this is really amazing now um so i want to show you a demo that i have so i'll walk you over the code right but uh let me show the data set so this data set have been taken from kaggle as you can see job posting data set so this is where i have taken this data set from now i'll walk you over the architecture diagram how you want to do things in production as well but for now let's see the action right this is beautiful as i said okay um so i'm gonna uh i'm gonna remove the company i just wanna show you a title so say uh so i have trained on on the job posting right now so now i can say hey i am i am looking for software developer or developer in new york uh so let's say i i entered the search query right let's see how it works on elasticsearch look at this java software engineer.net senior developer web developer i mean this is amazing the power of semantic search and elastic search that's it right you can build extremely extremely powerful um search engines on your data that you have so how i did all that i'm just gonna give you a high level overview and then i'll talk about some of the good practices we could do or how we 
could implement this in production right um so i took the data set right um i'm still cleaning up the code i would try to put this entire code in the uh my blog section so make sure to check that out so i essentially um you know so um the first thing that i did is i downloaded the model right from um from the website once i did that i essentially took the job posting data set there's a job post which is a string post which has job description salary job title everything right now what i did is essentially i took a random batch of data a random 500 sample i made a class called tokenizer that can tokenize my string job posting so then what i said is i i essentially uh over here as you can see job post i took all that and i applied a transformation so what that happened is essentially that created essentially a vectors for the entire job post string so now i did that now what i wanted to do is essentially um i created an index on elasticsearch with the k n k n mapping right there uh the dimension is 384 because my length of the vector is 384 the model that i'm using give gives me a vector of size 384 i had to flatten the vector because you cannot use a dense um uh vector on on on elk you have to flatten it out so i did that um after i did that then what i did is i uploaded the sample data set on elasticsearch with the appropriate mapping after i did that then and then i'm saying that hey enter the input or search query so the user is going to type job posting or you know whatever they are looking for and then what i simply did is i i convert that user whatever they type into a token or a vector and then i pass that vector to my elasticsearch knm right i say hey give me all the items that are related to that vector and find the k is equal to 20 nearest neighbors on that this is just the beginning as i said you could add tags now you could get so much relevant result we could also implement a ranking system using uh a leucine script which means um older the record the 
less relevant they are so we could do all that amazing thing on elasticsearch as well but this is as i said just a demo so as i can see as you can see i said hey i'm looking for software developer or a web web developer in new york the query the the the with the word embedding it's so smart enough that it from the entire job posting it knew that okay this guy is looking for software engineer or web developer so php developer is also web because they can develop websites using php so it gave me that so it's pretty pretty smart to be honest and for example let's say i am looking for chief i think i have a spelling mistake ah chief okay let's say i'm looking for chief financial officer in in new york so chief risk officer you know uh chief accountant secretary uh junior ma then um we have some more recent senior consultant um so it's it's working pretty fine and remember this is my very first attempt right we could add tags and improve uh the search as well so uh that's how you would do it now let's take a look at the architecture on how you would try to implement if you had to do this in production so the way this would work is essentially um the user will type a query like hey i'm looking for this that blah blah blah that will go hit an api gateway that will hit your microservice lambda the lambda would pull the model from s3 and it would convert it into a word embedding so the word embedding is created then the word embedding is passed through the elastic search uh where the vectors are matched and the appropriate tags could be added on the bull section or you could do you could add similar so you could also fuse this with essentially okay give me all the people that match my semantic search and filter out by a geo location this on that so all that amazing beautiful thing is possible with a hybrid solution as i said so you are essentially using elasticsearch for performing um searches and matching vectors and you are using your word embedding model so now tomorrow 
um if you if you wanna so the only downside with this approach is say you wanna change your model right so if the model will emit uh essentially the length of the vector uh it's not same what is defined on the elasticsearch mapping you've got a v index you gotta reindex all that data which is a problem but this problem could you could overcome this problem with the second solution what i'm about to propose is so um in this solution what we did is we up we appended all the vectors as well on the index so the second solution is to create an auxiliary index with just a job posting and essentially you know grab keywords tokens and then pass through the main index that is storing the data so now you form the query based on the similar item that you got from the auxiliary index right so um that's that but i hope you have enjoyed this small walkthrough of semantic search and um as i said um elasticsearch a hybrid solution that you could use for search engines let's try searching something else um let's see uh i am looking for an ios developer or and android let's see what it returns senior ios wow that's amazing i never said to him that i need senior ios i just said i'm looking for ios or android so it came senior ios developer senior ios developer so it's pretty interesting right uh the way this works uh it's pretty fascinating to me at least um as i said you could filter out unrelated data using tags as well so whatever user pass pass passes in the the string you could essentially then pass remove the stopwatch uh you could uh you know once you remove the stopwatch then you could add all the keywords as a tag in the or section in the query in the elastic side ios or developer and then of course you can add the vector with so all these things once you have your query properly you can get really really good results with that so well this is how you would use semantic search combined with elasticsearch right you want to convert things into vector then put the vectors on 
the elastic search and then you can perform matching on that well hope you have enjoyed it if you have any more questions on um this uh let me know i don't think you will find these sort of contents on on the internet you might find something on elasticsearch maybe on semantic but you won't find something that shows everything semantic elasticsearch from end to end right um so yeah hope you have enjoyed it i'll try to move i'll try to make more and more videos on these sort of topics so a lot of people can use it and remember guys lot of big companies are using similar techniques to do their searches right for example i'm talking about um zillow or any any big company that you see a website they are using this technology you can type fuzzy strings and behind there's an nlp engine that understands that and then performs the vector matching and gives you the uh for example amazon right when you say hey i'm looking for this it tells you hey uh suggestion right hey do you wanna are you interested in this this this similar items right so all this is implemented this way essentially all right thank you so much for watching hope you have enjoyed it if you have enjoyed it do let me know in the comment section and i would see you in the next video\", metadata={'source': 'AzCquTh4MUw'})]" | ||
] | ||
}, | ||
"execution_count": 60, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 59, | ||
"id": "0baa3b89", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Collecting youtube_transcript_api\n", | ||
" Downloading youtube_transcript_api-0.6.1-py3-none-any.whl (24 kB)\n", | ||
"Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from youtube_transcript_api) (2.31.0)\n", | ||
"Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->youtube_transcript_api) (3.1.0)\n", | ||
"Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->youtube_transcript_api) (3.4)\n", | ||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->youtube_transcript_api) (1.26.14)\n", | ||
"Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->youtube_transcript_api) (2023.5.7)\n", | ||
"Installing collected packages: youtube_transcript_api\n", | ||
"Successfully installed youtube_transcript_api-0.6.1\n", | ||
"\n", | ||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", | ||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# %pip installs into the running kernel's environment; the version is pinned for reproducibility.\n", | ||
"%pip install youtube_transcript_api==0.6.1" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 63, | ||
"id": "5e1827a3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.chains.qa_generation import base" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 65, | ||
"id": "34be9170", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"base.QAGenerationChain?" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 66, | ||
"id": "9142b88e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"WikipediaLoader?" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 67, | ||
"id": "e3507a7b", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"module" | ||
] | ||
}, | ||
"execution_count": 67, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"type(youtube)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 69, | ||
"id": "6704c488", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"method" | ||
] | ||
}, | ||
"execution_count": 69, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"type(youtube.YoutubeLoader.from_youtube_url)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "4bd299c8", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Binary file not shown.
Binary file not shown.
Binary file not shown.