diff --git a/haystack/preview/components/fetchers/link_content.py b/haystack/preview/components/fetchers/link_content.py index 34815079f8..664f716291 100644 --- a/haystack/preview/components/fetchers/link_content.py +++ b/haystack/preview/components/fetchers/link_content.py @@ -58,10 +58,10 @@ def __init__( """ Initializes a LinkContentFetcher instance. - :param raise_on_failure: If True, raises an exception on failure when fetching a single URL. - For multiple URLs, errors are logged and successful fetches are returned. Default is True. + :param raise_on_failure: If True, raises an exception if it fails to fetch a single URL. + For multiple URLs, it logs errors and returns the content it successfully fetched. Default is True. :param user_agents: A list of user agents for fetching content. If None, a default user agent is used. - :param retry_attempts: Number of retry attempts for fetching content. Default is 2. + :param retry_attempts: Specifies how many times you want it to retry to fetch the URL's content. Default is 2. :param timeout: Timeout in seconds for the request. Default is 3. """ self.raise_on_failure = raise_on_failure @@ -100,6 +100,7 @@ def run(self, urls: List[str]): """ Fetches content from a list of URLs and returns a list of extracted content streams. Each content stream is a ByteStream object containing the extracted content as binary data. + Each ByteStream object in the returned list corresponds to the contents of a single URL. The content type of each stream is stored in the metadata of the ByteStream object under the key "content_type". The URL of the fetched content is stored under the key "url". @@ -137,7 +138,7 @@ def fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]: :param url: The URL to fetch content from. :return: A tuple containing the ByteStream metadata dict and the corresponding ByteStream. ByteStream metadata contains the URL and the content type of the fetched content. - The content type is a string indicating the type of content fetched (e.g., "text/html", "application/pdf"). + The content type is a string indicating the type of content fetched (for example, "text/html", "application/pdf"). The ByteStream object contains the fetched content as binary data. :raises: If an error occurs during content retrieval and `raise_on_failure` is set to True, this method will @@ -155,7 +156,7 @@ def fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]: if self.raise_on_failure: raise e # less verbose log as this is expected to happen often (requests failing, blocked, etc.) - logger.debug("Couldn't retrieve content from %s due to %s", url, str(e)) + logger.debug("Couldn't retrieve content from %s because %s", url, str(e)) finally: self.current_user_agent_idx = 0 @@ -166,8 +167,8 @@ def _fetch_with_exception_suppression(self, url: str) -> Tuple[Optional[Dict[str """ Fetches content from a URL and returns it as a ByteStream. - If `raise_on_failure` is set to True, this method will wrap the fetch method and catch any exceptions. - Otherwise, it will simply call the fetch method. + If `raise_on_failure` is set to True, this method will wrap the fetch() method and catch any exceptions. + Otherwise, it will simply call the fetch() method. :param url: The URL to fetch content from. :return: A tuple containing the ByteStream metadata dict and the corresponding ByteStream.