Skip to content

Commit

Permalink
[Loaders] Improve web page and sitemap loader usability (#961)
Browse files: browse the repository at this point in the history
  • Loading branch information
deshraj authored Nov 17, 2023
1 parent 28460f7 commit e0b73e6
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 7 deletions.
3 changes: 2 additions & 1 deletion embedchain/loaders/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging

import requests
from tqdm import tqdm

try:
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -52,7 +53,7 @@ def load_link(link):

with concurrent.futures.ThreadPoolExecutor() as executor:
future_to_link = {executor.submit(load_link, link): link for link in links}
for future in concurrent.futures.as_completed(future_to_link):
for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links)):
link = future_to_link[future]
try:
data = future.result()
Expand Down
16 changes: 11 additions & 5 deletions embedchain/loaders/web_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@

@register_deserializable
class WebPageLoader(BaseLoader):
# Shared session for all instances
_session = requests.Session()

def load_data(self, url):
"""Load data from a web page."""
response = requests.get(url)
"""Load data from a web page using a shared requests session."""
response = self._session.get(url, timeout=30)
response.raise_for_status()
data = response.content
content = self._get_clean_content(data, url)

meta_data = {
"url": url,
}
meta_data = {"url": url}

doc_id = hashlib.sha256((content + url).encode()).hexdigest()
return {
Expand Down Expand Up @@ -86,3 +88,7 @@ def _get_clean_content(self, html, url) -> str:
)

return content

# Closes the class-level `_session` by calling its close() method; `_session` is the
# requests.Session created on WebPageLoader and shared by all instances.
# NOTE(review): nothing in this diff calls close_session — confirm who is expected to invoke it.
@classmethod
def close_session(cls):
cls._session.close()
2 changes: 1 addition & 1 deletion tests/loaders/test_web_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_load_data(web_page_loader):
</body>
</html>
"""
with patch("embedchain.loaders.web_page.requests.get", return_value=mock_response):
with patch("embedchain.loaders.web_page.WebPageLoader._session.get", return_value=mock_response):
result = web_page_loader.load_data(page_url)

content = web_page_loader._get_clean_content(mock_response.content, page_url)
Expand Down

0 comments on commit e0b73e6

Please sign in to comment.