From 2d76cebd4f412d52cc250b11fdf132aefcf74f61 Mon Sep 17 00:00:00 2001 From: szymon Date: Tue, 20 Feb 2024 11:29:07 +0100 Subject: [PATCH] User guide for LLM xpack (#5628) GitOrigin-RevId: 6c531c20328928ab9cbb9e52bb1d10b131d50400 --- pyproject.toml | 2 + python/pathway/xpacks/llm/embedders.py | 111 ++++---- python/pathway/xpacks/llm/llms.py | 316 ++++++++++++---------- python/pathway/xpacks/llm/parsers.py | 2 +- python/pathway/xpacks/llm/splitters.py | 24 +- python/pathway/xpacks/llm/vector_store.py | 40 ++- 6 files changed, 292 insertions(+), 203 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index addece0e..62ff8187 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,8 @@ tests = [ "litellm >= 1.0", "llama_index >= 0.9, < 0.10", "tiktoken >= 0.5", + "sentence_transformers", + "transformers" ] [project.urls] diff --git a/python/pathway/xpacks/llm/embedders.py b/python/pathway/xpacks/llm/embedders.py index 7c9cfafa..45c938e9 100644 --- a/python/pathway/xpacks/llm/embedders.py +++ b/python/pathway/xpacks/llm/embedders.py @@ -16,33 +16,34 @@ class OpenAIEmbedder(pw.UDFAsync): The capacity, retry_strategy and cache_strategy need to be specified during object construction. All other arguments can be overridden during application. - Parameters: - - capacity: Maximum number of concurrent operations allowed. - Defaults to None, indicating no specific limit. - - retry_strategy: Strategy for handling retries in case of failures. - Defaults to None. - - cache_strategy: Defines the caching mechanism. If set to None and a persistency - is enabled, operations will be cached using the persistence layer. - Defaults to None. - - model: ID of the model to use. You can use the - [List models](https://platform.openai.com/docs/api-reference/models/list) API to - see all of your available models, or see - [Model overview](https://platform.openai.com/docs/models/overview) for - descriptions of them. - - encoding_format: The format to return the embeddings in. Can be either `float` or - [`base64`](https://pypi.org/project/pybase64/). - - user: A unique identifier representing your end-user, which can help OpenAI to monitor - and detect abuse. - [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Timeout for requests, in seconds + Args: + - capacity: Maximum number of concurrent operations allowed. + Defaults to None, indicating no specific limit. + - retry_strategy: Strategy for handling retries in case of failures. + Defaults to None. + - cache_strategy: Defines the caching mechanism. If set to None and a persistency + is enabled, operations will be cached using the persistence layer. + Defaults to None. + - model: ID of the model to use. You can use the + `List models `_ API to + see all of your available models, or see + `Model overview `_ for + descriptions of them. + - encoding_format: The format to return the embeddings in. Can be either `float` or + `base64 `_. + - user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + `Learn more `_. + - extra_headers: Send extra headers + - extra_query: Add additional query parameters to the request + - extra_body: Add additional JSON properties to the request + - timeout: Timeout for requests, in seconds Any arguments can be provided either to the constructor or in the UDF call. 
To specify the `model` in the UDF call, set it to None. - Examples: + Example: + >>> import pathway as pw >>> from pathway.xpacks.llm import embedders >>> embedder = embedders.OpenAIEmbedder(model="text-embedding-ada-002") @@ -85,7 +86,7 @@ def __init__( async def __wrapped__(self, input, **kwargs) -> list[float]: """Embed the documents - Parameters: + Args: - input: mandatory, the string to embed. - **kwargs: optional parameters, if unset defaults from the constructor will be taken. @@ -104,29 +105,30 @@ class LiteLLMEmbedder(pw.UDFAsync): is provided. The capacity, retry_strategy and cache_strategy need to be specified during object construction. All other arguments can be overridden during application. - Parameters: - - capacity: Maximum number of concurrent operations allowed. - Defaults to None, indicating no specific limit. - - retry_strategy: Strategy for handling retries in case of failures. - Defaults to None. - - cache_strategy: Defines the caching mechanism. If set to None and a persistency - is enabled, operations will be cached using the persistence layer. - Defaults to None. - - model: The embedding model to use. - - timeout: The timeout value for the API call, default 10 mins - - litellm_call_id: The call ID for litellm logging. - - litellm_logging_obj: The litellm logging object. - - logger_fn: The logger function. - - api_base: Optional. The base URL for the API. - - api_version: Optional. The version of the API. - - api_key: Optional. The API key to use. - - api_type: Optional. The type of the API. - - custom_llm_provider: The custom llm provider. + Args: + - capacity: Maximum number of concurrent operations allowed. + Defaults to None, indicating no specific limit. + - retry_strategy: Strategy for handling retries in case of failures. + Defaults to None. + - cache_strategy: Defines the caching mechanism. If set to None and a persistency + is enabled, operations will be cached using the persistence layer. + Defaults to None. + - model: The embedding model to use. + - timeout: The timeout value for the API call, default 10 mins + - litellm_call_id: The call ID for litellm logging. + - litellm_logging_obj: The litellm logging object. + - logger_fn: The logger function. + - api_base: Optional. The base URL for the API. + - api_version: Optional. The version of the API. + - api_key: Optional. The API key to use. + - api_type: Optional. The type of the API. + - custom_llm_provider: The custom llm provider. Any arguments can be provided either to the constructor or in the UDF call. To specify the `model` in the UDF call, set it to None. - Examples: + Example: + >>> import pathway as pw >>> from pathway.xpacks.llm import embedders >>> embedder = embedders.LiteLLMEmbedder(model="text-embedding-ada-002") @@ -169,7 +171,7 @@ def __init__( async def __wrapped__(self, input, **kwargs) -> list[float]: """Embed the documents - Parameters: + Args: - input: mandatory, the string to embed. - **kwargs: optional parameters, if unset defaults from the constructor will be taken. @@ -187,11 +189,25 @@ class SentenceTransformerEmbedder(pw.UDFSync): model: model name or path call_kwargs: kwargs that will be passed to each call of encode. These can be overridden during each application. For possible arguments check - [the Sentence-Transformers documentation](https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode). + `the Sentence-Transformers documentation + `_. 
device: defines which device will be used to run the Pipeline sentencetransformer_kwargs: kwargs accepted during initialization of SentenceTransformers. For possible arguments check - [the Sentence-Transformers documentation](https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer) + `the Sentence-Transformers documentation + <https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer>`_ + + Example: + + >>> import pathway as pw + >>> from pathway.xpacks.llm import embedders + >>> embedder = embedders.SentenceTransformerEmbedder(model="intfloat/e5-large-v2") + >>> t = pw.debug.table_from_markdown(''' + ... txt + ... Text + ... ''') + >>> t.select(ret=embedder(pw.this.txt)) + """ # noqa: E501 def __init__( @@ -222,7 +238,8 @@ def __wrapped__(self, text: str, **kwargs) -> list[float]: - input: mandatory, the string to embed. - **kwargs: optional parameters for `encode` method. If unset defaults from the constructor will be taken. For possible arguments check - [the Sentence-Transformers documentation](https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode). + `the Sentence-Transformers documentation + <https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode>`_. """ # noqa: E501 kwargs = {**self.kwargs, **kwargs} return self.model.encode(text, **kwargs).tolist() diff --git a/python/pathway/xpacks/llm/llms.py b/python/pathway/xpacks/llm/llms.py index b70fa04d..7b981686 100644 --- a/python/pathway/xpacks/llm/llms.py +++ b/python/pathway/xpacks/llm/llms.py @@ -1,6 +1,8 @@ # Copyright © 2024 Pathway """ -UDF for calling LLMs: +Pathway UDFs for calling LLMs + +This module contains UDFs for calling LLM chat services: 1. wrappers over LLM APIs 2. prompt building tools """ @@ -18,128 +20,129 @@ class OpenAIChat(pw.UDFAsync): The capacity, retry_strategy and cache_strategy need to be specified during object construction. All other arguments can be overridden during application. - Parameters: - - capacity: Maximum number of concurrent operations allowed. - Defaults to None, indicating no specific limit. - - retry_strategy: Strategy for handling retries in case of failures. - Defaults to None. - - cache_strategy: Defines the caching mechanism. If set to None and a persistency - is enabled, operations will be cached using the persistence layer. - Defaults to None. - - model: ID of the model to use. See the - [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) - table for details on which models work with the Chat API. - - frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their - existing frequency in the text so far, decreasing the model's likelihood to - repeat the same line verbatim. - - [See more information about frequency and presence penalties.]( - https://platform.openai.com/docs/guides/text-generation/parameter-details) - - function_call: Deprecated in favor of `tool_choice`. - - Controls which (if any) function is called by the model. `none` means the model - will not call a function and instead generates a message. `auto` means the model - can pick between generating a message or calling a function. Specifying a - particular function via `{"name": "my_function"}` forces the model to call that - function. - - `none` is the default when no functions are present. `auto` is the default if - functions are present. - - functions: Deprecated in favor of `tools`. - - A list of functions the model may generate JSON inputs for.
- - logit_bias: Modify the likelihood of specified tokens appearing in the completion. - - Accepts a JSON object that maps tokens (specified by their token ID in the - tokenizer) to an associated bias value from -100 to 100. Mathematically, the - bias is added to the logits generated by the model prior to sampling. The exact - effect will vary per model, but values between -1 and 1 should decrease or - increase likelihood of selection; values like -100 or 100 should result in a ban - or exclusive selection of the relevant token. - - logprobs: Whether to return log probabilities of the output tokens or not. If true, - returns the log probabilities of each output token returned in the `content` of - `message`. This option is currently not available on the `gpt-4-vision-preview` - model. - - max_tokens: The maximum number of [tokens](/tokenizer) that can be generated in the chat - completion. - - The total length of input tokens and generated tokens is limited by the model's - context length. - [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) - for counting tokens. - - n: How many chat completion choices to generate for each input message. Note that - you will be charged based on the number of generated tokens across all of the - choices. Keep `n` as `1` to minimize costs. - - presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on - whether they appear in the text so far, increasing the model's likelihood to - talk about new topics. - - [See more information about frequency and presence penalties.]( - https://platform.openai.com/docs/guides/text-generation/parameter-details) - - response_format: An object specifying the format that the model must output. Compatible with - `gpt-4-1106-preview` and `gpt-3.5-turbo-1106`. - - Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the - message the model generates is valid JSON. - - **Important:** when using JSON mode, you **must** also instruct the model to - produce JSON yourself via a system or user message. Without this, the model may - generate an unending stream of whitespace until the generation reaches the token - limit, resulting in a long-running and seemingly "stuck" request. Also note that - the message content may be partially cut off if `finish_reason="length"`, which - indicates the generation exceeded `max_tokens` or the conversation exceeded the - max context length. - - seed: This feature is in Beta. If specified, our system will make a best effort to - sample deterministically, such that repeated requests with the same `seed` and - parameters should return the same result. Determinism is not guaranteed, and you - should refer to the `system_fingerprint` response parameter to monitor changes - in the backend. - - stop: Up to 4 sequences where the API will stop generating further tokens. - - stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be - sent as data-only - [server-sent events]( - https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) - as they become available, with the stream terminated by a `data: [DONE]` - message. - [Example Python code](https://cookbook.openai.com/examples/how_to_stream_completions). - - temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will - make the output more random, while lower values like 0.2 will make it more - focused and deterministic. 
- - We generally recommend altering this or `top_p` but not both. - - tool_choice: Controls which (if any) function is called by the model. `none` means the model - will not call a function and instead generates a message. `auto` means the model - can pick between generating a message or calling a function. Specifying a - particular function via - `{"type: "function", "function": {"name": "my_function"}}` forces the model to - call that function. - - `none` is the default when no functions are present. `auto` is the default if - functions are present. - - tools: A list of tools the model may call. Currently, only functions are supported as a - tool. Use this to provide a list of functions the model may generate JSON inputs - for. - - top_logprobs: An integer between 0 and 5 specifying the number of most likely tokens to return - at each token position, each with an associated log probability. `logprobs` must - be set to `true` if this parameter is used. - - top_p: An alternative to sampling with temperature, called nucleus sampling, where the - model considers the results of the tokens with top_p probability mass. So 0.1 - means only the tokens comprising the top 10% probability mass are considered. - - We generally recommend altering this or `temperature` but not both. - - user: A unique identifier representing your end-user, which can help OpenAI to monitor - and detect abuse. - [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds + Args: + - capacity: Maximum number of concurrent operations allowed. + Defaults to None, indicating no specific limit. + - retry_strategy: Strategy for handling retries in case of failures. + Defaults to None. + - cache_strategy: Defines the caching mechanism. If set to None and a persistency + is enabled, operations will be cached using the persistence layer. + Defaults to None. + - model: ID of the model to use. See the + `model endpoint compatibility `_ + table for details on which models work with the Chat API. + - frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + `See more information about frequency and presence penalties. + `_ + - function_call: Deprecated in favor of `tool_choice`. + + Controls which (if any) function is called by the model. `none` means the model + will not call a function and instead generates a message. `auto` means the model + can pick between generating a message or calling a function. Specifying a + particular function via `{"name": "my_function"}` forces the model to call that + function. + + `none` is the default when no functions are present. `auto` is the default if + functions are present. + - functions: Deprecated in favor of `tools`. + + A list of functions the model may generate JSON inputs for. + - logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. 
The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + - logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. This option is currently not available on the `gpt-4-vision-preview` + model. + - max_tokens: The maximum number of [tokens](/tokenizer) that can be generated in the chat + completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + `Example Python code `_ + for counting tokens. + - n: How many chat completion choices to generate for each input message. Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + - presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + `See more information about frequency and presence penalties. + `_ + - response_format: An object specifying the format that the model must output. Compatible with + `gpt-4-1106-preview` and `gpt-3.5-turbo-1106`. + + Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the + message the model generates is valid JSON. + + **Important:** when using JSON mode, you **must** also instruct the model to + produce JSON yourself via a system or user message. Without this, the model may + generate an unending stream of whitespace until the generation reaches the token + limit, resulting in a long-running and seemingly "stuck" request. Also note that + the message content may be partially cut off if `finish_reason="length"`, which + indicates the generation exceeded `max_tokens` or the conversation exceeded the + max context length. + - seed: This feature is in Beta. If specified, our system will make a best effort to + sample deterministically, such that repeated requests with the same `seed` and + parameters should return the same result. Determinism is not guaranteed, and you + should refer to the `system_fingerprint` response parameter to monitor changes + in the backend. + - stop: Up to 4 sequences where the API will stop generating further tokens. + - stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + `server-sent events + `_ + as they become available, with the stream terminated by a `data: [DONE]` + message. + `Example Python code `_. + - temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + - tool_choice: Controls which (if any) function is called by the model. `none` means the model + will not call a function and instead generates a message. `auto` means the model + can pick between generating a message or calling a function. Specifying a + particular function via + `{"type: "function", "function": {"name": "my_function"}}` forces the model to + call that function. + + `none` is the default when no functions are present. `auto` is the default if + functions are present. + - tools: A list of tools the model may call. Currently, only functions are supported as a + tool. 
Use this to provide a list of functions the model may generate JSON inputs + for. + - top_logprobs: An integer between 0 and 5 specifying the number of most likely tokens to return + at each token position, each with an associated log probability. `logprobs` must + be set to `true` if this parameter is used. + - top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + - user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + `Learn more `_. + - extra_headers: Send extra headers + - extra_query: Add additional query parameters to the request + - extra_body: Add additional JSON properties to the request + - timeout: Override the client-level default timeout for this request, in seconds Any arguments can be provided either to the constructor or in the UDF call. To specify the `model` in the UDF call, set it to None in the constructor. - Examples: + Example: + >>> import pathway as pw >>> from pathway.xpacks.llm import llms >>> from pathway.internals.asynchronous import ExponentialBackoffRetryStrategy @@ -192,30 +195,31 @@ class LiteLLMChat(pw.UDFAsync): construction. All other arguments can be overridden during application. Args: - - capacity: Maximum number of concurrent operations allowed. - Defaults to None, indicating no specific limit. - - retry_strategy: Strategy for handling retries in case of failures. - Defaults to None. - - cache_strategy: Defines the caching mechanism. If set to None and a persistency - is enabled, operations will be cached using the persistence layer. - Defaults to None. - - model: ID of the model to use. Check the - [providers supported by LiteLLM](https://docs.litellm.ai/docs/providers) - for details on which models work with the LiteLLM API. - - api_base: API endpoint to be used for the call. - - api_version: API version to be used for the call. Only for Azure models. - - num_retries: The number of retries if the API call fails. - - context_window_fallback_dict: Mapping of fallback models to be used in case of context window error - - fallbacks: List of fallback models to be used if the initial call fails - - metadata: Additional data to be logged when the call is made. + - capacity: Maximum number of concurrent operations allowed. + Defaults to None, indicating no specific limit. + - retry_strategy: Strategy for handling retries in case of failures. + Defaults to None. + - cache_strategy: Defines the caching mechanism. If set to None and a persistency + is enabled, operations will be cached using the persistence layer. + Defaults to None. + - model: ID of the model to use. Check the + `providers supported by LiteLLM `_ + for details on which models work with the LiteLLM API. + - api_base: API endpoint to be used for the call. + - api_version: API version to be used for the call. Only for Azure models. + - num_retries: The number of retries if the API call fails. + - context_window_fallback_dict: Mapping of fallback models to be used in case of context window error + - fallbacks: List of fallback models to be used if the initial call fails + - metadata: Additional data to be logged when the call is made. For more information on provider specific arguments check the - [LiteLLM documentation](https://docs.litellm.ai/docs/completion/input). 
+ `LiteLLM documentation <https://docs.litellm.ai/docs/completion/input>`_. Any arguments can be provided either to the constructor or in the UDF call. To specify the `model` in the UDF call, set it to None in the constructor. - Examples: + Example: + >>> import pathway as pw >>> from pathway.xpacks.llm import llms >>> from pathway.internals.asynchronous import ExponentialBackoffRetryStrategy @@ -265,14 +269,29 @@ class HFPipelineChat(pw.UDFSync): model: ID of the model to be used call_kwargs: kwargs that will be passed to each call of HuggingFace Pipeline. These can be overridden during each application. For possible arguments check - [the HuggingFace documentation](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TextGenerationPipeline.__call__). + `the HuggingFace documentation + <https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TextGenerationPipeline.__call__>`_. device: defines which device will be used to run the Pipeline pipeline_kwargs: kwargs accepted during initialization of HuggingFace Pipeline. For possible arguments check - [the HuggingFace documentation](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.pipeline). + `the HuggingFace documentation <https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.pipeline>`_. `call_kwargs` can be overridden during application, all other arguments need to be specified during class construction. + + Example: + + >>> import pathway as pw + >>> from pathway.xpacks.llm import llms + >>> from pathway.internals.asynchronous import ExponentialBackoffRetryStrategy + >>> chat = llms.HFPipelineChat(model="gpt2", retry_strategy=ExponentialBackoffRetryStrategy(max_retries=6)) + >>> t = pw.debug.table_from_markdown(''' + ... txt + ... Wazzup? + ... ''') + >>> r = t.select(ret=chat(llms.prompt_chat_single_qa(t.txt))) + >>> r + """ # noqa: E501 def __init__( @@ -324,5 +343,24 @@ def wrapped(input_string: str) -> str: @pw.udf def prompt_chat_single_qa(question: str) -> pw.Json: - """Create chat prompt messages for single question answering.""" + """ + Create chat prompt messages for single question answering. A string with a question + is converted into a one-element list containing a dictionary with the keys `role` and `content`. + + Args: + question: a column with questions to be transformed into prompts + + Example: + + >>> import pathway as pw + >>> from pathway.xpacks.llm import llms + >>> t = pw.debug.table_from_markdown(''' + ... txt + ... Wazzup? + ... ''') + >>> r = t.select(prompt=llms.prompt_chat_single_qa(t.txt)) + >>> pw.debug.compute_and_print(r, include_id=False) + prompt + [{"content": "Wazzup?", "role": "system"}] + """ return pw.Json([dict(role="system", content=question)]) diff --git a/python/pathway/xpacks/llm/parsers.py b/python/pathway/xpacks/llm/parsers.py index 57ba6940..675ca2b3 100644 --- a/python/pathway/xpacks/llm/parsers.py +++ b/python/pathway/xpacks/llm/parsers.py @@ -28,7 +28,7 @@ def __wrapped__(self, contents: bytes) -> list[tuple[str, dict]]: # MIT licensed class ParseUnstructured(pw.UDFSync): """ - Parse document using https://unstructured.io/. + Parse document using `https://unstructured.io/ <https://unstructured.io/>`_. All arguments can be overridden during UDF application. diff --git a/python/pathway/xpacks/llm/splitters.py b/python/pathway/xpacks/llm/splitters.py index 80d3ddc4..4da2dde0 100644 --- a/python/pathway/xpacks/llm/splitters.py +++ b/python/pathway/xpacks/llm/splitters.py @@ -42,24 +42,24 @@ class TokenCountSplitter(pw.UDFSync): All arguments set default which may be overridden in the UDF call - Arguments: + Args: min_tokens: minimum tokens in a chunk of text. max_tokens: maximum size of a chunk in tokens. encoding_name: name of the encoding from `tiktoken`.
Example: - # >>> from pathway.xpacks.llm.splitters import TokenCountSplitter - # >>> import pathway as pw - # >>> t = pw.debug.table_from_markdown( - # ... '''| text - # ... 1| cooltext''' - # ... ) - # >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1) - # >>> t += t.select(chunks = pw.apply(splitter, pw.this.text)) - # >>> pw.debug.compute_and_print(t, include_id=False) - # text | chunks - # cooltext | (('cool', pw.Json({})), ('text', pw.Json({}))) + >>> from pathway.xpacks.llm.splitters import TokenCountSplitter + >>> import pathway as pw + >>> t = pw.debug.table_from_markdown( + ... '''| text + ... 1| cooltext''' + ... ) + >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1) + >>> t += t.select(chunks = splitter(pw.this.text)) + >>> pw.debug.compute_and_print(t, include_id=False) + text | chunks + cooltext | (('cool', pw.Json({})), ('text', pw.Json({}))) """ CHARS_PER_TOKEN = 3 diff --git a/python/pathway/xpacks/llm/vector_store.py b/python/pathway/xpacks/llm/vector_store.py index e1976458..10f9f63f 100644 --- a/python/pathway/xpacks/llm/vector_store.py +++ b/python/pathway/xpacks/llm/vector_store.py @@ -440,12 +440,31 @@ def run(): class VectorStoreClient: + """ + A client you can use to query :py:class:`VectorStoreServer`. + + Args: + - host: host on which :py:class:`VectorStoreServer` listens + - port: port on which :py:class:`VectorStoreServer` listens + """ + def __init__(self, host, port): self.host = host self.port = port - def query(self, query, k=3, metadata_filter=None) -> list[dict]: - """Perform a query to the vector store and fetch results.""" + def query( + self, query: str, k: int = 3, metadata_filter: str | None = None ) -> list[dict]: + """ + Perform a query to the vector store and fetch results. + + Args: + - query: the query text to search for + - k: number of documents to be returned + - metadata_filter: optional string representing the metadata filtering query + in the JMESPath format. The search will happen only for documents + satisfying this filtering. + """ data = {"query": query, "k": k} if metadata_filter is not None: @@ -474,8 +493,21 @@ def get_vectorstore_statistics(self): responses = response.json() return responses - def get_input_files(self, metadata_filter=None, filepath_globpattern=None): - """Fetch basic statistics about the vector store.""" + def get_input_files( + self, + metadata_filter: str | None = None, + filepath_globpattern: str | None = None, + ): + """ + Fetch information on documents in the vector store. + + Args: + metadata_filter: optional string representing the metadata filtering query + in the JMESPath format. The search will happen only for documents + satisfying this filtering. + filepath_globpattern: optional glob pattern specifying which documents + will be searched for this query. + """ url = f"http://{self.host}:{self.port}/v1/inputs" response = requests.post( url,
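For reference, a minimal usage sketch of the client documented in this patch. It only relies on the constructor and method signatures shown above; the host, port, query text, and glob pattern are placeholder values you would replace with your own, and it assumes a VectorStoreServer is already running at that address.

from pathway.xpacks.llm.vector_store import VectorStoreClient

# Placeholder host/port: use the address your VectorStoreServer listens on.
client = VectorStoreClient(host="127.0.0.1", port=8000)

# Fetch the 3 most relevant documents for a query; metadata_filter (JMESPath)
# could additionally restrict which documents are searched.
docs = client.query("What is Pathway?", k=3)

# List the indexed input files, optionally narrowed to a glob pattern.
files = client.get_input_files(filepath_globpattern="**/*.md")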