From 2d76cebd4f412d52cc250b11fdf132aefcf74f61 Mon Sep 17 00:00:00 2001 From: szymon Date: Tue, 20 Feb 2024 11:29:07 +0100 Subject: [PATCH] User guide for LLM xpack (#5628) GitOrigin-RevId: 6c531c20328928ab9cbb9e52bb1d10b131d50400 --- pyproject.toml | 2 + python/pathway/xpacks/llm/embedders.py | 111 ++++---- python/pathway/xpacks/llm/llms.py | 316 ++++++++++++---------- python/pathway/xpacks/llm/parsers.py | 2 +- python/pathway/xpacks/llm/splitters.py | 24 +- python/pathway/xpacks/llm/vector_store.py | 40 ++- 6 files changed, 292 insertions(+), 203 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index addece0e..62ff8187 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,8 @@ tests = [ "litellm >= 1.0", "llama_index >= 0.9, < 0.10", "tiktoken >= 0.5", + "sentence_transformers", + "transformers" ] [project.urls] diff --git a/python/pathway/xpacks/llm/embedders.py b/python/pathway/xpacks/llm/embedders.py index 7c9cfafa..45c938e9 100644 --- a/python/pathway/xpacks/llm/embedders.py +++ b/python/pathway/xpacks/llm/embedders.py @@ -16,33 +16,34 @@ class OpenAIEmbedder(pw.UDFAsync): The capacity, retry_strategy and cache_strategy need to be specified during object construction. All other arguments can be overridden during application. - Parameters: - - capacity: Maximum number of concurrent operations allowed. - Defaults to None, indicating no specific limit. - - retry_strategy: Strategy for handling retries in case of failures. - Defaults to None. - - cache_strategy: Defines the caching mechanism. If set to None and a persistency - is enabled, operations will be cached using the persistence layer. - Defaults to None. - - model: ID of the model to use. You can use the - [List models](https://platform.openai.com/docs/api-reference/models/list) API to - see all of your available models, or see - [Model overview](https://platform.openai.com/docs/models/overview) for - descriptions of them. - - encoding_format: The format to return the embeddings in. Can be either `float` or - [`base64`](https://pypi.org/project/pybase64/). - - user: A unique identifier representing your end-user, which can help OpenAI to monitor - and detect abuse. - [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Timeout for requests, in seconds + Args: + - capacity: Maximum number of concurrent operations allowed. + Defaults to None, indicating no specific limit. + - retry_strategy: Strategy for handling retries in case of failures. + Defaults to None. + - cache_strategy: Defines the caching mechanism. If set to None and a persistency + is enabled, operations will be cached using the persistence layer. + Defaults to None. + - model: ID of the model to use. You can use the + `List models `_ API to + see all of your available models, or see + `Model overview `_ for + descriptions of them. + - encoding_format: The format to return the embeddings in. Can be either `float` or + `base64 `_. + - user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + `Learn more `_. + - extra_headers: Send extra headers + - extra_query: Add additional query parameters to the request + - extra_body: Add additional JSON properties to the request + - timeout: Timeout for requests, in seconds Any arguments can be provided either to the constructor or in the UDF call. 
To specify the `model` in the UDF call, set it to None. - Examples: + Example: + >>> import pathway as pw >>> from pathway.xpacks.llm import embedders >>> embedder = embedders.OpenAIEmbedder(model="text-embedding-ada-002") @@ -85,7 +86,7 @@ def __init__( async def __wrapped__(self, input, **kwargs) -> list[float]: """Embed the documents - Parameters: + Args: - input: mandatory, the string to embed. - **kwargs: optional parameters, if unset defaults from the constructor will be taken. @@ -104,29 +105,30 @@ class LiteLLMEmbedder(pw.UDFAsync): is provided. The capacity, retry_strategy and cache_strategy need to be specified during object construction. All other arguments can be overridden during application. - Parameters: - - capacity: Maximum number of concurrent operations allowed. - Defaults to None, indicating no specific limit. - - retry_strategy: Strategy for handling retries in case of failures. - Defaults to None. - - cache_strategy: Defines the caching mechanism. If set to None and a persistency - is enabled, operations will be cached using the persistence layer. - Defaults to None. - - model: The embedding model to use. - - timeout: The timeout value for the API call, default 10 mins - - litellm_call_id: The call ID for litellm logging. - - litellm_logging_obj: The litellm logging object. - - logger_fn: The logger function. - - api_base: Optional. The base URL for the API. - - api_version: Optional. The version of the API. - - api_key: Optional. The API key to use. - - api_type: Optional. The type of the API. - - custom_llm_provider: The custom llm provider. + Args: + - capacity: Maximum number of concurrent operations allowed. + Defaults to None, indicating no specific limit. + - retry_strategy: Strategy for handling retries in case of failures. + Defaults to None. + - cache_strategy: Defines the caching mechanism. If set to None and a persistency + is enabled, operations will be cached using the persistence layer. + Defaults to None. + - model: The embedding model to use. + - timeout: The timeout value for the API call, default 10 mins + - litellm_call_id: The call ID for litellm logging. + - litellm_logging_obj: The litellm logging object. + - logger_fn: The logger function. + - api_base: Optional. The base URL for the API. + - api_version: Optional. The version of the API. + - api_key: Optional. The API key to use. + - api_type: Optional. The type of the API. + - custom_llm_provider: The custom llm provider. Any arguments can be provided either to the constructor or in the UDF call. To specify the `model` in the UDF call, set it to None. - Examples: + Example: + >>> import pathway as pw >>> from pathway.xpacks.llm import embedders >>> embedder = embedders.LiteLLMEmbedder(model="text-embedding-ada-002") @@ -169,7 +171,7 @@ def __init__( async def __wrapped__(self, input, **kwargs) -> list[float]: """Embed the documents - Parameters: + Args: - input: mandatory, the string to embed. - **kwargs: optional parameters, if unset defaults from the constructor will be taken. @@ -187,11 +189,25 @@ class SentenceTransformerEmbedder(pw.UDFSync): model: model name or path call_kwargs: kwargs that will be passed to each call of encode. These can be overridden during each application. For possible arguments check - [the Sentence-Transformers documentation](https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode). + `the Sentence-Transformers documentation + `_. 
device: defines which device will be used to run the Pipeline sentencetransformer_kwargs: kwargs accepted during initialization of SentenceTransformers. For possible arguments check - [the Sentence-Transformers documentation](https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer) + `the Sentence-Transformers documentation + <https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer>`_ + + Example: + + >>> import pathway as pw + >>> from pathway.xpacks.llm import embedders + >>> embedder = embedders.SentenceTransformerEmbedder(model="intfloat/e5-large-v2") + >>> t = pw.debug.table_from_markdown(''' + ... txt + ... Text + ... ''') + >>> t.select(ret=embedder(pw.this.txt)) + """ # noqa: E501 def __init__( @@ -222,7 +238,8 @@ def __wrapped__(self, text: str, **kwargs) -> list[float]: - input: mandatory, the string to embed. - **kwargs: optional parameters for `encode` method. If unset defaults from the constructor will be taken. For possible arguments check - [the Sentence-Transformers documentation](https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode). + `the Sentence-Transformers documentation + <https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode>`_. """ # noqa: E501 kwargs = {**self.kwargs, **kwargs} return self.model.encode(text, **kwargs).tolist() diff --git a/python/pathway/xpacks/llm/llms.py b/python/pathway/xpacks/llm/llms.py index b70fa04d..7b981686 100644 --- a/python/pathway/xpacks/llm/llms.py +++ b/python/pathway/xpacks/llm/llms.py @@ -1,6 +1,8 @@ # Copyright © 2024 Pathway """ -UDF for calling LLMs: +Pathway UDFs for calling LLMs + +This module contains UDFs for calling LLM chat services: 1. wrappers over LLM APIs 2. prompt building tools """ @@ -18,128 +20,129 @@ class OpenAIChat(pw.UDFAsync): The capacity, retry_strategy and cache_strategy need to be specified during object construction. All other arguments can be overridden during application. - Parameters: - - capacity: Maximum number of concurrent operations allowed. - Defaults to None, indicating no specific limit. - - retry_strategy: Strategy for handling retries in case of failures. - Defaults to None. - - cache_strategy: Defines the caching mechanism. If set to None and a persistency - is enabled, operations will be cached using the persistence layer. - Defaults to None. - - model: ID of the model to use. See the - [model endpoint compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) - table for details on which models work with the Chat API. - - frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their - existing frequency in the text so far, decreasing the model's likelihood to - repeat the same line verbatim. - - [See more information about frequency and presence penalties.]( - https://platform.openai.com/docs/guides/text-generation/parameter-details) - - function_call: Deprecated in favor of `tool_choice`. - - Controls which (if any) function is called by the model. `none` means the model - will not call a function and instead generates a message. `auto` means the model - can pick between generating a message or calling a function. Specifying a - particular function via `{"name": "my_function"}` forces the model to call that - function. - - `none` is the default when no functions are present. `auto` is the default if - functions are present. - - functions: Deprecated in favor of `tools`. - - A list of functions the model may generate JSON inputs for.
- - logit_bias: Modify the likelihood of specified tokens appearing in the completion. - - Accepts a JSON object that maps tokens (specified by their token ID in the - tokenizer) to an associated bias value from -100 to 100. Mathematically, the - bias is added to the logits generated by the model prior to sampling. The exact - effect will vary per model, but values between -1 and 1 should decrease or - increase likelihood of selection; values like -100 or 100 should result in a ban - or exclusive selection of the relevant token. - - logprobs: Whether to return log probabilities of the output tokens or not. If true, - returns the log probabilities of each output token returned in the `content` of - `message`. This option is currently not available on the `gpt-4-vision-preview` - model. - - max_tokens: The maximum number of [tokens](/tokenizer) that can be generated in the chat - completion. - - The total length of input tokens and generated tokens is limited by the model's - context length. - [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) - for counting tokens. - - n: How many chat completion choices to generate for each input message. Note that - you will be charged based on the number of generated tokens across all of the - choices. Keep `n` as `1` to minimize costs. - - presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on - whether they appear in the text so far, increasing the model's likelihood to - talk about new topics. - - [See more information about frequency and presence penalties.]( - https://platform.openai.com/docs/guides/text-generation/parameter-details) - - response_format: An object specifying the format that the model must output. Compatible with - `gpt-4-1106-preview` and `gpt-3.5-turbo-1106`. - - Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the - message the model generates is valid JSON. - - **Important:** when using JSON mode, you **must** also instruct the model to - produce JSON yourself via a system or user message. Without this, the model may - generate an unending stream of whitespace until the generation reaches the token - limit, resulting in a long-running and seemingly "stuck" request. Also note that - the message content may be partially cut off if `finish_reason="length"`, which - indicates the generation exceeded `max_tokens` or the conversation exceeded the - max context length. - - seed: This feature is in Beta. If specified, our system will make a best effort to - sample deterministically, such that repeated requests with the same `seed` and - parameters should return the same result. Determinism is not guaranteed, and you - should refer to the `system_fingerprint` response parameter to monitor changes - in the backend. - - stop: Up to 4 sequences where the API will stop generating further tokens. - - stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be - sent as data-only - [server-sent events]( - https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) - as they become available, with the stream terminated by a `data: [DONE]` - message. - [Example Python code](https://cookbook.openai.com/examples/how_to_stream_completions). - - temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will - make the output more random, while lower values like 0.2 will make it more - focused and deterministic. 
- - We generally recommend altering this or `top_p` but not both. - - tool_choice: Controls which (if any) function is called by the model. `none` means the model - will not call a function and instead generates a message. `auto` means the model - can pick between generating a message or calling a function. Specifying a - particular function via - `{"type: "function", "function": {"name": "my_function"}}` forces the model to - call that function. - - `none` is the default when no functions are present. `auto` is the default if - functions are present. - - tools: A list of tools the model may call. Currently, only functions are supported as a - tool. Use this to provide a list of functions the model may generate JSON inputs - for. - - top_logprobs: An integer between 0 and 5 specifying the number of most likely tokens to return - at each token position, each with an associated log probability. `logprobs` must - be set to `true` if this parameter is used. - - top_p: An alternative to sampling with temperature, called nucleus sampling, where the - model considers the results of the tokens with top_p probability mass. So 0.1 - means only the tokens comprising the top 10% probability mass are considered. - - We generally recommend altering this or `temperature` but not both. - - user: A unique identifier representing your end-user, which can help OpenAI to monitor - and detect abuse. - [Learn more](https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids). - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds + Args: + - capacity: Maximum number of concurrent operations allowed. + Defaults to None, indicating no specific limit. + - retry_strategy: Strategy for handling retries in case of failures. + Defaults to None. + - cache_strategy: Defines the caching mechanism. If set to None and a persistency + is enabled, operations will be cached using the persistence layer. + Defaults to None. + - model: ID of the model to use. See the + `model endpoint compatibility `_ + table for details on which models work with the Chat API. + - frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + `See more information about frequency and presence penalties. + `_ + - function_call: Deprecated in favor of `tool_choice`. + + Controls which (if any) function is called by the model. `none` means the model + will not call a function and instead generates a message. `auto` means the model + can pick between generating a message or calling a function. Specifying a + particular function via `{"name": "my_function"}` forces the model to call that + function. + + `none` is the default when no functions are present. `auto` is the default if + functions are present. + - functions: Deprecated in favor of `tools`. + + A list of functions the model may generate JSON inputs for. + - logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. 
The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + - logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. This option is currently not available on the `gpt-4-vision-preview` + model. + - max_tokens: The maximum number of [tokens](/tokenizer) that can be generated in the chat + completion. + + The total length of input tokens and generated tokens is limited by the model's + context length. + `Example Python code `_ + for counting tokens. + - n: How many chat completion choices to generate for each input message. Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + - presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + `See more information about frequency and presence penalties. + `_ + - response_format: An object specifying the format that the model must output. Compatible with + `gpt-4-1106-preview` and `gpt-3.5-turbo-1106`. + + Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the + message the model generates is valid JSON. + + **Important:** when using JSON mode, you **must** also instruct the model to + produce JSON yourself via a system or user message. Without this, the model may + generate an unending stream of whitespace until the generation reaches the token + limit, resulting in a long-running and seemingly "stuck" request. Also note that + the message content may be partially cut off if `finish_reason="length"`, which + indicates the generation exceeded `max_tokens` or the conversation exceeded the + max context length. + - seed: This feature is in Beta. If specified, our system will make a best effort to + sample deterministically, such that repeated requests with the same `seed` and + parameters should return the same result. Determinism is not guaranteed, and you + should refer to the `system_fingerprint` response parameter to monitor changes + in the backend. + - stop: Up to 4 sequences where the API will stop generating further tokens. + - stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be + sent as data-only + `server-sent events + `_ + as they become available, with the stream terminated by a `data: [DONE]` + message. + `Example Python code `_. + - temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. + + We generally recommend altering this or `top_p` but not both. + - tool_choice: Controls which (if any) function is called by the model. `none` means the model + will not call a function and instead generates a message. `auto` means the model + can pick between generating a message or calling a function. Specifying a + particular function via + `{"type: "function", "function": {"name": "my_function"}}` forces the model to + call that function. + + `none` is the default when no functions are present. `auto` is the default if + functions are present. + - tools: A list of tools the model may call. Currently, only functions are supported as a + tool. 
Use this to provide a list of functions the model may generate JSON inputs + for. + - top_logprobs: An integer between 0 and 5 specifying the number of most likely tokens to return + at each token position, each with an associated log probability. `logprobs` must + be set to `true` if this parameter is used. + - top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + - user: A unique identifier representing your end-user, which can help OpenAI to monitor + and detect abuse. + `Learn more `_. + - extra_headers: Send extra headers + - extra_query: Add additional query parameters to the request + - extra_body: Add additional JSON properties to the request + - timeout: Override the client-level default timeout for this request, in seconds Any arguments can be provided either to the constructor or in the UDF call. To specify the `model` in the UDF call, set it to None in the constructor. - Examples: + Example: + >>> import pathway as pw >>> from pathway.xpacks.llm import llms >>> from pathway.internals.asynchronous import ExponentialBackoffRetryStrategy @@ -192,30 +195,31 @@ class LiteLLMChat(pw.UDFAsync): construction. All other arguments can be overridden during application. Args: - - capacity: Maximum number of concurrent operations allowed. - Defaults to None, indicating no specific limit. - - retry_strategy: Strategy for handling retries in case of failures. - Defaults to None. - - cache_strategy: Defines the caching mechanism. If set to None and a persistency - is enabled, operations will be cached using the persistence layer. - Defaults to None. - - model: ID of the model to use. Check the - [providers supported by LiteLLM](https://docs.litellm.ai/docs/providers) - for details on which models work with the LiteLLM API. - - api_base: API endpoint to be used for the call. - - api_version: API version to be used for the call. Only for Azure models. - - num_retries: The number of retries if the API call fails. - - context_window_fallback_dict: Mapping of fallback models to be used in case of context window error - - fallbacks: List of fallback models to be used if the initial call fails - - metadata: Additional data to be logged when the call is made. + - capacity: Maximum number of concurrent operations allowed. + Defaults to None, indicating no specific limit. + - retry_strategy: Strategy for handling retries in case of failures. + Defaults to None. + - cache_strategy: Defines the caching mechanism. If set to None and a persistency + is enabled, operations will be cached using the persistence layer. + Defaults to None. + - model: ID of the model to use. Check the + `providers supported by LiteLLM `_ + for details on which models work with the LiteLLM API. + - api_base: API endpoint to be used for the call. + - api_version: API version to be used for the call. Only for Azure models. + - num_retries: The number of retries if the API call fails. + - context_window_fallback_dict: Mapping of fallback models to be used in case of context window error + - fallbacks: List of fallback models to be used if the initial call fails + - metadata: Additional data to be logged when the call is made. For more information on provider specific arguments check the - [LiteLLM documentation](https://docs.litellm.ai/docs/completion/input). 
+ `LiteLLM documentation <https://docs.litellm.ai/docs/completion/input>`_. Any arguments can be provided either to the constructor or in the UDF call. To specify the `model` in the UDF call, set it to None in the constructor. - Examples: + Example: + >>> import pathway as pw >>> from pathway.xpacks.llm import llms >>> from pathway.internals.asynchronous import ExponentialBackoffRetryStrategy @@ -265,14 +269,29 @@ class HFPipelineChat(pw.UDFSync): model: ID of the model to be used call_kwargs: kwargs that will be passed to each call of HuggingFace Pipeline. These can be overridden during each application. For possible arguments check - [the HuggingFace documentation](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TextGenerationPipeline.__call__). + `the HuggingFace documentation + <https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TextGenerationPipeline.__call__>`_. device: defines which device will be used to run the Pipeline pipeline_kwargs: kwargs accepted during initialization of HuggingFace Pipeline. For possible arguments check - [the HuggingFace documentation](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.pipeline). + `the HuggingFace documentation <https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.pipeline>`_. `call_kwargs` can be overridden during application, all other arguments need to be specified during class construction. + + Example: + + >>> import pathway as pw + >>> from pathway.xpacks.llm import llms + >>> from pathway.internals.asynchronous import ExponentialBackoffRetryStrategy + >>> chat = llms.HFPipelineChat(model="gpt2", retry_strategy=ExponentialBackoffRetryStrategy(max_retries=6)) + >>> t = pw.debug.table_from_markdown(''' + ... txt + ... Wazzup? + ... ''') + >>> r = t.select(ret=chat(llms.prompt_chat_single_qa(t.txt))) + >>> r + """ # noqa: E501 def __init__( @@ -324,5 +343,24 @@ def wrapped(input_string: str) -> str: @pw.udf def prompt_chat_single_qa(question: str) -> pw.Json: - """Create chat prompt messages for single question answering.""" + """ + Create chat prompt messages for single question answering. A string with a question + is converted into a one-element list containing a dictionary with the keys `role` and `content`. + + Args: + question: a column with questions to be transformed into prompts + + Example: + + >>> import pathway as pw + >>> from pathway.xpacks.llm import llms + >>> t = pw.debug.table_from_markdown(''' + ... txt + ... Wazzup? + ... ''') + >>> r = t.select(prompt=llms.prompt_chat_single_qa(t.txt)) + >>> pw.debug.compute_and_print(r, include_id=False) + prompt + [{"content": "Wazzup?", "role": "system"}] + """ return pw.Json([dict(role="system", content=question)]) diff --git a/python/pathway/xpacks/llm/parsers.py b/python/pathway/xpacks/llm/parsers.py index 57ba6940..675ca2b3 100644 --- a/python/pathway/xpacks/llm/parsers.py +++ b/python/pathway/xpacks/llm/parsers.py @@ -28,7 +28,7 @@ def __wrapped__(self, contents: bytes) -> list[tuple[str, dict]]: # MIT licensed class ParseUnstructured(pw.UDFSync): """ - Parse document using https://unstructured.io/. + Parse document using `https://unstructured.io/ <https://unstructured.io/>`_. All arguments can be overridden during UDF application. diff --git a/python/pathway/xpacks/llm/splitters.py b/python/pathway/xpacks/llm/splitters.py index 80d3ddc4..4da2dde0 100644 --- a/python/pathway/xpacks/llm/splitters.py +++ b/python/pathway/xpacks/llm/splitters.py @@ -42,24 +42,24 @@ class TokenCountSplitter(pw.UDFSync): All arguments set default which may be overridden in the UDF call - Arguments: + Args: min_tokens: minimum tokens in a chunk of text. max_tokens: maximum size of a chunk in tokens. encoding_name: name of the encoding from `tiktoken`.
Example: - # >>> from pathway.xpacks.llm.splitters import TokenCountSplitter - # >>> import pathway as pw - # >>> t = pw.debug.table_from_markdown( - # ... '''| text - # ... 1| cooltext''' - # ... ) - # >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1) - # >>> t += t.select(chunks = pw.apply(splitter, pw.this.text)) - # >>> pw.debug.compute_and_print(t, include_id=False) - # text | chunks - # cooltext | (('cool', pw.Json({})), ('text', pw.Json({}))) + >>> from pathway.xpacks.llm.splitters import TokenCountSplitter + >>> import pathway as pw + >>> t = pw.debug.table_from_markdown( + ... '''| text + ... 1| cooltext''' + ... ) + >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1) + >>> t += t.select(chunks = splitter(pw.this.text)) + >>> pw.debug.compute_and_print(t, include_id=False) + text | chunks + cooltext | (('cool', pw.Json({})), ('text', pw.Json({}))) """ CHARS_PER_TOKEN = 3 diff --git a/python/pathway/xpacks/llm/vector_store.py b/python/pathway/xpacks/llm/vector_store.py index e1976458..10f9f63f 100644 --- a/python/pathway/xpacks/llm/vector_store.py +++ b/python/pathway/xpacks/llm/vector_store.py @@ -440,12 +440,31 @@ def run(): class VectorStoreClient: + """ + A client you can use to query :py:class:`VectorStoreServer`. + + Args: + - host: host on which :py:class:`VectorStoreServer` listens + - port: port on which :py:class:`VectorStoreServer` listens + """ + def __init__(self, host, port): self.host = host self.port = port - def query(self, query, k=3, metadata_filter=None) -> list[dict]: - """Perform a query to the vector store and fetch results.""" + def query( + self, query: str, k: int = 3, metadata_filter: str | None = None ) -> list[dict]: + """ + Perform a query to the vector store and fetch results. + + Args: + - query: the query text to search for + - k: number of documents to be returned + - metadata_filter: optional string representing the metadata filtering query + in the JMESPath format. The search will happen only for documents + satisfying this filtering. + """ data = {"query": query, "k": k} if metadata_filter is not None: @@ -474,8 +493,21 @@ def get_vectorstore_statistics(self): responses = response.json() return responses - def get_input_files(self, metadata_filter=None, filepath_globpattern=None): - """Fetch basic statistics about the vector store.""" + def get_input_files( + self, + metadata_filter: str | None = None, + filepath_globpattern: str | None = None, + ): + """ + Fetch information on documents in the vector store. + + Args: + metadata_filter: optional string representing the metadata filtering query + in the JMESPath format. The search will happen only for documents + satisfying this filtering. + filepath_globpattern: optional glob pattern specifying which documents + will be searched for this query. + """ url = f"http://{self.host}:{self.port}/v1/inputs" response = requests.post( url,
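For reference, a minimal usage sketch of the client documented in this patch. It only relies on the constructor and method signatures shown above; the host, port, query text, and glob pattern are placeholder values you would replace with your own, and it assumes a VectorStoreServer is already running at that address.

from pathway.xpacks.llm.vector_store import VectorStoreClient

# Placeholder host/port: use the address your VectorStoreServer listens on.
client = VectorStoreClient(host="127.0.0.1", port=8000)

# Fetch the 3 most relevant documents for a query; metadata_filter (JMESPath)
# could additionally restrict which documents are searched.
docs = client.query("What is Pathway?", k=3)

# List the indexed input files, optionally narrowed to a glob pattern.
files = client.get_input_files(filepath_globpattern="**/*.md")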