From 56ab3eba4b0286f51caa53d1b3e27aae9113c73b Mon Sep 17 00:00:00 2001 From: ChunHao <64747455+chuang8511@users.noreply.github.com> Date: Fri, 12 Jul 2024 03:10:34 +0100 Subject: [PATCH 1/4] feat(text): add input and output and fix bugs (#209) Because - position in the chunks are different from the original source - model name and encoding name are not inter-changeable - markdown chunk and recursive chunk need the token count as well - document for text component is not clear - precommit hook modify the test data which cause the test case failed This commit - fix position bugs - add model name and token count to markdown chunk and recursive chunk - take out encoding name in token chunk - use injection to make the text document clear - fix precommit hook --- .pre-commit-config.yaml | 2 +- ai/openai/v0/README.mdx | 1 + base/formats_test.go | 2 - .../text/v0/.compogen/extra-chunk-text.mdx | 46 +++++ operator/text/v0/README.mdx | 74 ++++--- operator/text/v0/chunk_text.go | 97 +++++++-- operator/text/v0/chunk_text_test.go | 148 ++++++++++++-- operator/text/v0/config/definition.json | 1 - operator/text/v0/config/tasks.json | 188 ++---------------- operator/text/v0/main.go | 18 +- operator/text/v0/main_test.go | 10 - operator/text/v0/split.go | 52 ----- operator/text/v0/split_test.go | 28 --- .../text/v0/testdata/chinese/chunk1_1.txt | 4 + .../text/v0/testdata/chinese/chunk1_2.txt | 2 + .../text/v0/testdata/chinese/chunk1_3.txt | 3 + operator/text/v0/testdata/chinese/text1.txt | 32 +++ .../v0/testdata/chinese_markdown/chunk1_1.txt | 5 + .../v0/testdata/chinese_markdown/chunk1_2.txt | 5 + .../v0/testdata/chinese_markdown/chunk1_3.txt | 5 + .../v0/testdata/chinese_markdown/text1.txt | 32 +++ .../text/v0/testdata/english/chunk1_1.txt | 3 + .../text/v0/testdata/english/chunk1_2.txt | 4 + .../text/v0/testdata/english/chunk1_3.txt | 3 + operator/text/v0/testdata/english/text1.txt | 37 ++++ tools/compogen/README.md | 4 +- 26 files changed, 458 insertions(+), 348 deletions(-) 
create mode 100644 operator/text/v0/.compogen/extra-chunk-text.mdx delete mode 100644 operator/text/v0/split.go delete mode 100644 operator/text/v0/split_test.go create mode 100644 operator/text/v0/testdata/chinese/chunk1_1.txt create mode 100644 operator/text/v0/testdata/chinese/chunk1_2.txt create mode 100644 operator/text/v0/testdata/chinese/chunk1_3.txt create mode 100644 operator/text/v0/testdata/chinese/text1.txt create mode 100644 operator/text/v0/testdata/chinese_markdown/chunk1_1.txt create mode 100644 operator/text/v0/testdata/chinese_markdown/chunk1_2.txt create mode 100644 operator/text/v0/testdata/chinese_markdown/chunk1_3.txt create mode 100644 operator/text/v0/testdata/chinese_markdown/text1.txt create mode 100644 operator/text/v0/testdata/english/chunk1_1.txt create mode 100644 operator/text/v0/testdata/english/chunk1_2.txt create mode 100644 operator/text/v0/testdata/english/chunk1_3.txt create mode 100644 operator/text/v0/testdata/english/text1.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 163e46b0..e7f8beca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: - id: check-json - id: check-merge-conflict - id: end-of-file-fixer - exclude: tools/compogen/cmd/testdata + exclude: (?i).*testdata/ exclude_types: [svg,mdx] - id: trailing-whitespace - id: pretty-format-json diff --git a/ai/openai/v0/README.mdx b/ai/openai/v0/README.mdx index e0204104..ea13136c 100644 --- a/ai/openai/v0/README.mdx +++ b/ai/openai/v0/README.mdx @@ -85,6 +85,7 @@ Turn text into numbers, unlocking use cases like search. | Task ID (required) | `task` | string | `TASK_TEXT_EMBEDDINGS` | | Model (required) | `model` | string | ID of the model to use. You can use the [List models](/docs/api-reference/models/list) API to see all of your available models, or see our [Model overview](/docs/models/overview) for descriptions of them. 
| | Text (required) | `text` | string | The text | +| Dimensions | `dimensions` | integer | The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models. | diff --git a/base/formats_test.go b/base/formats_test.go index 0cf7abd0..8bddb746 100644 --- a/base/formats_test.go +++ b/base/formats_test.go @@ -3,7 +3,6 @@ package base import ( "bufio" "encoding/base64" - "fmt" "io" "os" "testing" @@ -25,7 +24,6 @@ func TestUtil_GetFileExtension(t *testing.T) { fileBase64 := base64.StdEncoding.EncodeToString(content) fileBase64 = "data:image/png;base64," + fileBase64 - fmt.Println(fileBase64) gotFileExtension := GetBase64FileExtension(fileBase64) c.Check(gotFileExtension, qt.Equals, wantFileExtension) } diff --git a/operator/text/v0/.compogen/extra-chunk-text.mdx b/operator/text/v0/.compogen/extra-chunk-text.mdx new file mode 100644 index 00000000..103d0dc4 --- /dev/null +++ b/operator/text/v0/.compogen/extra-chunk-text.mdx @@ -0,0 +1,46 @@ +### Chunking Strategy +There are three strategies available for chunking text in Text Component: +- 1. Token +- 2. Recursive +- 3. Markdown + +#### Token +Language models have a token limit. You should not exceed the token limit. When you split your text into chunks it is therefore a good idea to count the number of tokens. There are many tokenizers. When you count tokens in your text you should use the same tokenizer as used in the language model. 
+ +| **Parameter** | **Type** | **Description** | +|----------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `model-name` | string | The name of the model used for tokenization | +| `allowed-special` | array of strings | A list of special tokens that are allowed within chunks | +| `disallowed-special` | array of strings | A list of special tokens that should not appear within chunks | + +#### Recursive +This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text. 
+ +| **Parameter** | **Type** | **Description** | +|--------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `model-name` | string | The name of the model used for tokenization | +| `separators` | array of strings | A list of strings representing the separators used to split the text | +| `keep-separator` | boolean | A flag indicating whether to keep the separator characters at the beginning or end of chunks | + + +#### Markdown +This text splitter is specially designed for Markdown format. + +| **Parameter** | **Type** | **Description** | +|--------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `model-name` | string | The name of the model used for tokenization | +| `code-blocks` | boolean | A flag indicating whether code blocks should be treated as a single unit | +| `reference-links` | boolean | A flag indicating whether reference links should be kept intact | + +### Text Chunks in Output +| **Parameter** | **Type** | **Description** | +|------------------|----------|--------------------------------------------------------------| +| `text` | string | The text chunk | +| `start-position` | integer | The starting position of the text chunk in the original text | +| `end-position` 
| integer | The ending position of the text chunk in the original text | diff --git a/operator/text/v0/README.mdx b/operator/text/v0/README.mdx index d117e102..774260b3 100644 --- a/operator/text/v0/README.mdx +++ b/operator/text/v0/README.mdx @@ -9,7 +9,6 @@ The Text component is an operator component that allows users to extract and man It can carry out the following tasks: - [Convert To Text](#convert-to-text) -- [Split By Token](#split-by-token) - [Chunk Text](#chunk-text) @@ -54,34 +53,9 @@ Convert document to text. -### Split By Token - -Split text by token. It will be decprecated soon. Please use Chunk Text task instead. - - -| Input | ID | Type | Description | -| :--- | :--- | :--- | :--- | -| Task ID (required) | `task` | string | `TASK_SPLIT_BY_TOKEN` | -| Text (required) | `text` | string | Text to be split | -| Model (required) | `model` | string | ID of the model to use for tokenization | -| Chunk Token Size | `chunk-token-size` | integer | Number of tokens per text chunk | - - - -| Output | ID | Type | Description | -| :--- | :--- | :--- | :--- | -| Token Count | `token-count` | integer | Total count of tokens in the input text | -| Text Chunks | `text-chunks` | array[string] | Text chunks after splitting | -| Number of Text Chunks | `chunk-num` | integer | Total number of output text chunks | - - - - - - ### Chunk Text -Chunk text with different strategies. +Chunk text with different strategies | Input | ID | Type | Description | @@ -99,6 +73,52 @@ Chunk text with different strategies. | Number of Text Chunks | `chunk-num` | integer | Total number of output text chunks | +### Chunking Strategy +There are three strategies available for chunking text in Text Component: +- 1. Token +- 2. Recursive +- 3. Markdown + +#### Token +Language models have a token limit. You should not exceed the token limit. When you split your text into chunks it is therefore a good idea to count the number of tokens. There are many tokenizers. 
When you count tokens in your text you should use the same tokenizer as used in the language model. + +| **Parameter** | **Type** | **Description** | +|----------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `model-name` | string | The name of the model used for tokenization | +| `allowed-special` | array of strings | A list of special tokens that are allowed within chunks | +| `disallowed-special` | array of strings | A list of special tokens that should not appear within chunks | + +#### Recursive +This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text. 
+ +| **Parameter** | **Type** | **Description** | +|--------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `model-name` | string | The name of the model used for tokenization | +| `separators` | array of strings | A list of strings representing the separators used to split the text | +| `keep-separator` | boolean | A flag indicating whether to keep the separator characters at the beginning or end of chunks | + + +#### Markdown +This text splitter is specially designed for Markdown format. + +| **Parameter** | **Type** | **Description** | +|--------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `model-name` | string | The name of the model used for tokenization | +| `code-blocks` | boolean | A flag indicating whether code blocks should be treated as a single unit | +| `reference-links` | boolean | A flag indicating whether reference links should be kept intact | + +### Text Chunks in Output +| **Parameter** | **Type** | **Description** | +|------------------|----------|--------------------------------------------------------------| +| `text` | string | The text chunk | +| `start-position` | integer | The starting position of the text chunk in the original text | +| `end-position` 
| integer | The ending position of the text chunk in the original text | diff --git a/operator/text/v0/chunk_text.go b/operator/text/v0/chunk_text.go index 628dea5b..25cde221 100644 --- a/operator/text/v0/chunk_text.go +++ b/operator/text/v0/chunk_text.go @@ -2,6 +2,7 @@ package text import ( "fmt" + "reflect" "github.com/pkoukk/tiktoken-go" "github.com/tmc/langchaingo/textsplitter" @@ -21,7 +22,6 @@ type Setting struct { ChunkSize int `json:"chunk-size,omitempty"` ChunkOverlap int `json:"chunk-overlap,omitempty"` ModelName string `json:"model-name,omitempty"` - EncodingName string `json:"encoding-name,omitempty"` AllowedSpecial []string `json:"allowed-special,omitempty"` DisallowedSpecial []string `json:"disallowed-special,omitempty"` Separators []string `json:"separators,omitempty"` @@ -41,8 +41,8 @@ type ChunkTextOutput struct { type TextChunk struct { Text string `json:"text"` - StartPosition int `json:"start-position,omitempty"` - EndPosition int `json:"end-position,omitempty"` + StartPosition int `json:"start-position"` + EndPosition int `json:"end-position"` } func (s *Setting) SetDefault() { @@ -55,9 +55,6 @@ func (s *Setting) SetDefault() { if s.ModelName == "" { s.ModelName = "gpt-3.5-turbo" } - if s.EncodingName == "" { - s.EncodingName = "cl100k_base" - } if s.AllowedSpecial == nil { s.AllowedSpecial = []string{} } @@ -77,9 +74,11 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) { setting.SetDefault() var output ChunkTextOutput + var positionCalculator ChunkPositionCalculator + switch setting.ChunkMethod { case "Token": - + positionCalculator = PositionCalculator{} if setting.ChunkOverlap >= setting.ChunkSize { err := fmt.Errorf("ChunkOverlap must be less than ChunkSize when using Token method") return output, err @@ -89,18 +88,11 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) { textsplitter.WithChunkSize(setting.ChunkSize), textsplitter.WithChunkOverlap(setting.ChunkOverlap), 
textsplitter.WithModelName(setting.ModelName), - textsplitter.WithEncodingName(setting.EncodingName), textsplitter.WithAllowedSpecial(setting.AllowedSpecial), textsplitter.WithDisallowedSpecial(setting.DisallowedSpecial), ) - - tkm, err := tiktoken.EncodingForModel(setting.ModelName) - if err != nil { - return output, err - } - token := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial) - output.TokenCount = len(token) case "Markdown": + positionCalculator = MarkdownPositionCalculator{} split = textsplitter.NewMarkdownTextSplitter( textsplitter.WithChunkSize(setting.ChunkSize), textsplitter.WithChunkOverlap(setting.ChunkOverlap), @@ -108,6 +100,7 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) { textsplitter.WithReferenceLinks(setting.ReferenceLinks), ) case "Recursive": + positionCalculator = PositionCalculator{} split = textsplitter.NewRecursiveCharacter( textsplitter.WithSeparators(setting.Separators), textsplitter.WithChunkSize(setting.ChunkSize), @@ -116,21 +109,85 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) { ) } + tkm, err := tiktoken.EncodingForModel(setting.ModelName) + if err != nil { + return output, err + } + token := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial) + output.TokenCount = len(token) + chunks, err := split.SplitText(input.Text) if err != nil { return output, err } output.ChunkNum = len(chunks) - startPosition := 1 - for _, c := range chunks { + startScanPosition := 0 + rawRunes := []rune(input.Text) + for _, chunk := range chunks { + chunkRunes := []rune(chunk) + + startPosition, endPosition := positionCalculator.getChunkPositions(rawRunes, chunkRunes, startScanPosition) output.TextChunks = append(output.TextChunks, TextChunk{ - Text: c, + Text: chunk, StartPosition: startPosition, - EndPosition: startPosition + len(c) - 1, + EndPosition: endPosition, }) - startPosition += len(c) + startScanPosition = startPosition + 1 } return output, nil } + +type 
ChunkPositionCalculator interface { + getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int) +} + +type PositionCalculator struct{} + +func (PositionCalculator) getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int) { + + for i := startScanPosition; i < len(rawText); i++ { + if rawText[i] == chunk[0] { + if reflect.DeepEqual(rawText[i:i+len(chunk)], chunk) { + startPosition = i + endPosition = len(chunk) + i - 1 + break + } + } + } + return startPosition, endPosition +} + +type MarkdownPositionCalculator struct{} + +func (MarkdownPositionCalculator) getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int) { + + skipHeaderIndex := getSkipHeaderIndex(chunk) + + for i := startScanPosition; i < len(rawText); i++ { + + if rawText[i] == chunk[skipHeaderIndex] { + if reflect.DeepEqual(rawText[i:(i+len(chunk)-skipHeaderIndex)], chunk[skipHeaderIndex:]) { + startPosition = i + endPosition = len(chunk) + i - 1 - skipHeaderIndex + break + } + } + } + return startPosition, endPosition +} + +func getSkipHeaderIndex(chunk []rune) int { + hashtagCount := 0 + for i := 0; i < len(chunk); i++ { + if chunk[i] == '#' { + hashtagCount++ + } + + if hashtagCount >= 1 && chunk[i] == '\n' { + return i + 1 + } + } + return 0 +} diff --git a/operator/text/v0/chunk_text_test.go b/operator/text/v0/chunk_text_test.go index 474ac626..58f0720a 100644 --- a/operator/text/v0/chunk_text_test.go +++ b/operator/text/v0/chunk_text_test.go @@ -1,6 +1,7 @@ package text import ( + "os" "testing" "github.com/frankban/quicktest" @@ -21,9 +22,9 @@ func TestChunkText(t *testing.T) { Text: "Hello world.", Strategy: Strategy{ Setting: Setting{ - ChunkMethod: "Token", - ChunkSize: 512, - ModelName: "gpt-3.5-turbo", + ChunkMethod: "Token", + ChunkSize: 512, + ModelName: "gpt-3.5-turbo", }, }, }, @@ -31,8 +32,8 @@ func TestChunkText(t *testing.T) { TextChunks: []TextChunk{ { Text: 
"Hello world.", - StartPosition: 1, - EndPosition: 12, + StartPosition: 0, + EndPosition: 11, }, }, ChunkNum: 1, @@ -45,8 +46,9 @@ func TestChunkText(t *testing.T) { Text: "Hello world.", Strategy: Strategy{ Setting: Setting{ - ChunkMethod: "Markdown", - ChunkSize: 5, + ChunkMethod: "Markdown", + ModelName: "gpt-3.5-turbo", + ChunkSize: 5, }, }, }, @@ -54,11 +56,12 @@ func TestChunkText(t *testing.T) { TextChunks: []TextChunk{ { Text: "Hello world.", - StartPosition: 1, - EndPosition: 12, + StartPosition: 0, + EndPosition: 11, }, }, - ChunkNum: 1, + ChunkNum: 1, + TokenCount: 3, }, }, { @@ -67,9 +70,10 @@ func TestChunkText(t *testing.T) { Text: "Hello world.", Strategy: Strategy{ Setting: Setting{ - ChunkMethod: "Recursive", - ChunkSize: 5, - Separators: []string{" ", "."}, + ChunkMethod: "Recursive", + ModelName: "gpt-3.5-turbo", + ChunkSize: 5, + Separators: []string{" ", "."}, }, }, }, @@ -77,8 +81,8 @@ func TestChunkText(t *testing.T) { TextChunks: []TextChunk{ { Text: "Hello", - StartPosition: 1, - EndPosition: 5, + StartPosition: 0, + EndPosition: 4, }, { Text: "world", @@ -86,7 +90,8 @@ func TestChunkText(t *testing.T) { EndPosition: 10, }, }, - ChunkNum: 2, + ChunkNum: 2, + TokenCount: 3, }, }, } @@ -99,3 +104,114 @@ func TestChunkText(t *testing.T) { }) } } + +func Test_ChunkPositionCalculator(t *testing.T) { + c := quicktest.New(t) + + testCases := []struct { + name string + positionCalculatorType string + rawTextFilePath string + chunkTextFilePath string + expectStartPosition int + expectEndPosition int + }{ + { + name: "Chinese text with NOT Markdown Chunking 1", + positionCalculatorType: "PositionCalculator", + rawTextFilePath: "testdata/chinese/text1.txt", + chunkTextFilePath: "testdata/chinese/chunk1_1.txt", + expectStartPosition: 0, + expectEndPosition: 35, + }, + { + name: "Chinese text with NOT Markdown Chunking 2", + positionCalculatorType: "PositionCalculator", + rawTextFilePath: "testdata/chinese/text1.txt", + chunkTextFilePath: 
"testdata/chinese/chunk1_2.txt", + expectStartPosition: 26, + expectEndPosition: 46, + }, + { + name: "Chinese text with NOT Markdown Chunking 3", + positionCalculatorType: "PositionCalculator", + rawTextFilePath: "testdata/chinese/text1.txt", + chunkTextFilePath: "testdata/chinese/chunk1_3.txt", + expectStartPosition: 49, + expectEndPosition: 80, + }, + { + name: "Chinese text with Markdown Chunking 1", + positionCalculatorType: "MarkdownPositionCalculator", + rawTextFilePath: "testdata/chinese_markdown/text1.txt", + chunkTextFilePath: "testdata/chinese_markdown/chunk1_1.txt", + expectStartPosition: 4, + expectEndPosition: 46, + }, + { + name: "Chinese text with Markdown Chunking 2", + positionCalculatorType: "MarkdownPositionCalculator", + rawTextFilePath: "testdata/chinese_markdown/text1.txt", + chunkTextFilePath: "testdata/chinese_markdown/chunk1_2.txt", + expectStartPosition: 49, + expectEndPosition: 91, + }, + { + name: "Chinese text with Markdown Chunking 3", + positionCalculatorType: "MarkdownPositionCalculator", + rawTextFilePath: "testdata/chinese_markdown/text1.txt", + chunkTextFilePath: "testdata/chinese_markdown/chunk1_3.txt", + expectStartPosition: 98, + expectEndPosition: 140, + }, + { + name: "English text with Markdown Chunking 1", + positionCalculatorType: "MarkdownPositionCalculator", + rawTextFilePath: "testdata/english/text1.txt", + chunkTextFilePath: "testdata/english/chunk1_1.txt", + expectStartPosition: 4, + expectEndPosition: 25, + }, + { + name: "English text with Markdown Chunking 2", + positionCalculatorType: "MarkdownPositionCalculator", + rawTextFilePath: "testdata/english/text1.txt", + chunkTextFilePath: "testdata/english/chunk1_2.txt", + expectStartPosition: 16, + expectEndPosition: 47, + }, + { + name: "English text with Markdown Chunking 3", + positionCalculatorType: "MarkdownPositionCalculator", + rawTextFilePath: "testdata/english/text1.txt", + chunkTextFilePath: "testdata/english/chunk1_3.txt", + expectStartPosition: 38, + 
expectEndPosition: 58, + }, + } + + for _, tc := range testCases { + c.Run(tc.name, func(c *quicktest.C) { + var calculator ChunkPositionCalculator + if tc.positionCalculatorType == "PositionCalculator" { + calculator = PositionCalculator{} + } else if tc.positionCalculatorType == "MarkdownPositionCalculator" { + calculator = MarkdownPositionCalculator{} + } + rawTextBytes, err := os.ReadFile(tc.rawTextFilePath) + c.Assert(err, quicktest.IsNil) + rawTextRunes := []rune(string(rawTextBytes)) + + chunkText, err := os.ReadFile(tc.chunkTextFilePath) + c.Assert(err, quicktest.IsNil) + + chunkTextRunes := []rune(string(chunkText)) + + startPosition, endPosition := calculator.getChunkPositions(rawTextRunes, chunkTextRunes, 0) + + c.Assert(startPosition, quicktest.Equals, tc.expectStartPosition) + c.Assert(endPosition, quicktest.Equals, tc.expectEndPosition) + + }) + } +} diff --git a/operator/text/v0/config/definition.json b/operator/text/v0/config/definition.json index a536ad0b..d5a094ce 100644 --- a/operator/text/v0/config/definition.json +++ b/operator/text/v0/config/definition.json @@ -1,7 +1,6 @@ { "availableTasks": [ "TASK_CONVERT_TO_TEXT", - "TASK_SPLIT_BY_TOKEN", "TASK_CHUNK_TEXT" ], "custom": false, diff --git a/operator/text/v0/config/tasks.json b/operator/text/v0/config/tasks.json index 590dd369..4b25d56e 100644 --- a/operator/text/v0/config/tasks.json +++ b/operator/text/v0/config/tasks.json @@ -17,7 +17,7 @@ }, "chunk-size": { "default": 512, - "description": "Specifies the maximum size of each chunk in terms of the number of tokens.", + "description": "Specifies the maximum size of each chunk in terms of the number of tokens", "instillAcceptFormats": [ "integer" ], @@ -32,7 +32,7 @@ }, "chunk-overlap": { "default": 100, - "description": "Determines the number of tokens that overlap between consecutive chunks.", + "description": "Determines the number of tokens that overlap between consecutive chunks", "instillAcceptFormats": [ "integer" ], @@ -169,149 +169,8 
@@ "type": "object" } }, - "TASK_SPLIT_BY_TOKEN": { - "instillShortDescription": "Split text by token. It will be decprecated soon. Please use Chunk Text task instead.", - "input": { - "description": "Input", - "instillEditOnNodeFields": [ - "text", - "model" - ], - "instillUIOrder": 0, - "properties": { - "chunk-token-size": { - "default": 500, - "description": "Number of tokens per text chunk", - "instillAcceptFormats": [ - "integer" - ], - "instillUIOrder": 2, - "instillUpstreamTypes": [ - "value", - "reference" - ], - "minimum": 1, - "title": "Chunk Token Size", - "type": "integer" - }, - "model": { - "description": "ID of the model to use for tokenization", - "enum": [ - "gpt-4", - "gpt-3.5-turbo", - "text-davinci-003", - "text-davinci-002", - "text-davinci-001", - "text-curie-001", - "text-babbage-001", - "text-ada-001", - "davinci", - "curie", - "babbage", - "ada", - "code-davinci-002", - "code-davinci-001", - "code-cushman-002", - "code-cushman-001", - "davinci-codex", - "cushman-codex", - "text-davinci-edit-001", - "code-davinci-edit-001", - "text-embedding-ada-002", - "text-similarity-davinci-001", - "text-similarity-curie-001", - "text-similarity-babbage-001", - "text-similarity-ada-001", - "text-search-davinci-doc-001", - "text-search-curie-doc-001", - "text-search-babbage-doc-001", - "text-search-ada-doc-001", - "code-search-babbage-code-001", - "code-search-ada-code-001", - "gpt2" - ], - "instillAcceptFormats": [ - "string" - ], - "instillUIOrder": 1, - "instillUpstreamTypes": [ - "value", - "reference", - "template" - ], - "title": "Model", - "type": "string" - }, - "text": { - "description": "Text to be split", - "instillAcceptFormats": [ - "string" - ], - "instillUIMultiline": true, - "instillUIOrder": 0, - "instillUpstreamTypes": [ - "value", - "reference", - "template" - ], - "title": "Text", - "type": "string" - } - }, - "required": [ - "text", - "model" - ], - "title": "Input", - "type": "object" - }, - "output": { - "description": "Output", - 
"instillEditOnNodeFields": [ - "texts" - ], - "instillUIOrder": 0, - "properties": { - "chunk-num": { - "description": "Total number of output text chunks", - "instillUIOrder": 2, - "instillFormat": "integer", - "title": "Number of Text Chunks", - "type": "integer" - }, - "text-chunks": { - "description": "Text chunks after splitting", - "instillUIOrder": 1, - "instillFormat": "array:string", - "items": { - "title": "Text Chunk", - "description": "Text chunk after splitting", - "instillFormat": "string", - "instillUIMultiline": true, - "type": "string" - }, - "title": "Text Chunks", - "type": "array" - }, - "token-count": { - "description": "Total count of tokens in the input text", - "instillUIOrder": 0, - "instillFormat": "integer", - "title": "Token Count", - "type": "integer" - } - }, - "required": [ - "token-count", - "text-chunks", - "chunk-num" - ], - "title": "Output", - "type": "object" - } - }, "TASK_CHUNK_TEXT": { - "instillShortDescription": "Chunk text with different strategies.", + "instillShortDescription": "Chunk text with different strategies", "input": { "description": "Input", "instillEditOnNodeFields": [ @@ -350,29 +209,9 @@ "model-name": { "$ref": "#/$defs/model-name" }, - "encoding-name": { - "description": "The name of the encoding used to convert text into tokens.", - "enum": [ - "cl100k_base", - "p50k_base", - "r50k_base", - "p50k_edit" - ], - "instillAcceptFormats": [ - "string" - ], - "instillUIOrder": 3, - "instillUpstreamTypes": [ - "value", - "reference", - "template" - ], - "title": "Encoding Name", - "type": "string" - }, "allowed-special": { "default": [], - "description": "A list of special tokens that are allowed within chunks. 
", + "description": "A list of special tokens that are allowed within chunks.", "instillAcceptFormats": [ "array:string" ], @@ -417,7 +256,6 @@ "chunk-size", "chunk-overlap", "model-name", - "encoding-name", "allowed-special", "disallowed-special" ], @@ -435,6 +273,9 @@ "chunk-overlap": { "$ref": "#/$defs/chunk-overlap" }, + "model-name": { + "$ref": "#/$defs/model-name" + }, "separators": { "default": [], "description": "A list of strings representing the separators used to split the text.", @@ -455,7 +296,7 @@ "type": "array" }, "keep-separator": { - "description": "A flag indicating whether to keep the separator characters at the beginning or end of chunks.", + "description": "A flag indicating whether to keep the separator characters at the beginning or end of chunks", "instillAcceptFormats": [ "boolean" ], @@ -476,6 +317,7 @@ "chunk-method", "chunk-size", "chunk-overlap", + "model-name", "separators", "keep-separator" ], @@ -493,8 +335,11 @@ "chunk-overlap": { "$ref": "#/$defs/chunk-overlap" }, + "model-name": { + "$ref": "#/$defs/model-name" + }, "code-blocks": { - "description": "A flag indicating whether code blocks should be treated as a single unit.", + "description": "A flag indicating whether code blocks should be treated as a single unit", "instillAcceptFormats": [ "boolean" ], @@ -508,7 +353,7 @@ "type": "boolean" }, "reference-links": { - "description": "A flag indicating whether reference links should be kept intact.", + "description": "A flag indicating whether reference links should be kept intact", "instillAcceptFormats": [ "boolean" ], @@ -529,6 +374,7 @@ "chunk-method", "chunk-size", "chunk-overlap", + "model-name", "code-blocks", "reference-links" ], @@ -590,9 +436,7 @@ } }, "required": [ - "text", - "start-position", - "end-position" + "text" ], "instillUIMultiline": true, "type": "object" diff --git a/operator/text/v0/main.go b/operator/text/v0/main.go index 27f3eac2..0643f808 100644 --- a/operator/text/v0/main.go +++ 
b/operator/text/v0/main.go @@ -1,4 +1,4 @@ -//go:generate compogen readme ./config ./README.mdx +//go:generate compogen readme ./config ./README.mdx --extraContents TASK_CHUNK_TEXT=.compogen/extra-chunk-text.mdx package text import ( @@ -15,7 +15,6 @@ import ( const ( taskConvertToText string = "TASK_CONVERT_TO_TEXT" - taskSplitByToken string = "TASK_SPLIT_BY_TOKEN" taskChunkText string = "TASK_CHUNK_TEXT" ) @@ -77,21 +76,6 @@ func (e *execution) Execute(_ context.Context, inputs []*structpb.Struct) ([]*st return nil, err } outputs = append(outputs, output) - case taskSplitByToken: - inputStruct := SplitByTokenInput{} - err := base.ConvertFromStructpb(input, &inputStruct) - if err != nil { - return nil, err - } - outputStruct, err := splitTextIntoChunks(inputStruct) - if err != nil { - return nil, err - } - output, err := base.ConvertToStructpb(outputStruct) - if err != nil { - return nil, err - } - outputs = append(outputs, output) case taskChunkText: inputStruct := ChunkTextInput{} err := base.ConvertFromStructpb(input, &inputStruct) diff --git a/operator/text/v0/main_test.go b/operator/text/v0/main_test.go index 18986734..bc6d30f8 100644 --- a/operator/text/v0/main_test.go +++ b/operator/text/v0/main_test.go @@ -32,16 +32,6 @@ func TestOperator(t *testing.T) { }, }, }, - { - name: "split by token", - task: "TASK_SPLIT_BY_TOKEN", - input: structpb.Struct{ - Fields: map[string]*structpb.Value{ - "text": {Kind: &structpb.Value_StringValue{StringValue: "Hello world. 
This is a test."}}, - "model": {Kind: &structpb.Value_StringValue{StringValue: "gpt-3.5-turbo"}}, - }, - }, - }, { name: "chunk texts", task: "TASK_CHUNK_TEXT", diff --git a/operator/text/v0/split.go b/operator/text/v0/split.go deleted file mode 100644 index c1db0def..00000000 --- a/operator/text/v0/split.go +++ /dev/null @@ -1,52 +0,0 @@ -package text - -import ( - "github.com/pkoukk/tiktoken-go" -) - -const defaultChunkTokenSize = 500 - -// SplitByTokenInput defines the input for split by token task -type SplitByTokenInput struct { - // Text: Text to split - Text string `json:"text"` - // Model: ID of the model to use for tokenization - Model string `json:"model"` - // ChunkTokenSize: Number of tokens per text chunk - ChunkTokenSize *int `json:"chunk-token-size,omitempty"` -} - -// SplitByTokenOutput defines the output for split by token task -type SplitByTokenOutput struct { - // TokenCount: Number of tokens in the text - TokenCount int `json:"token-count"` - // TextChunks: List of text chunks - TextChunks []string `json:"text-chunks"` - // ChunkNum: Number of text chunks - ChunkNum int `json:"chunk-num"` -} - -// splitTextIntoChunks splits text into text chunks based on token size -func splitTextIntoChunks(input SplitByTokenInput) (SplitByTokenOutput, error) { - output := SplitByTokenOutput{} - - if input.ChunkTokenSize == nil || *input.ChunkTokenSize <= 0 { - input.ChunkTokenSize = new(int) - *input.ChunkTokenSize = defaultChunkTokenSize - } - - tkm, err := tiktoken.EncodingForModel(input.Model) - if err != nil { - return output, err - } - - token := tkm.Encode(input.Text, nil, nil) - output.TokenCount = len(token) - output.TextChunks = []string{} - for start := 0; start < len(token); start += *input.ChunkTokenSize { - end := min(start+*input.ChunkTokenSize, len(token)) - output.TextChunks = append(output.TextChunks, tkm.Decode(token[start:end])) - } - output.ChunkNum = len(output.TextChunks) - return output, nil -} diff --git a/operator/text/v0/split_test.go 
b/operator/text/v0/split_test.go deleted file mode 100644 index 18c63a0d..00000000 --- a/operator/text/v0/split_test.go +++ /dev/null @@ -1,28 +0,0 @@ -package text - -import ( - "context" - "testing" - - "google.golang.org/protobuf/types/known/structpb" -) - -// TestSplitByToken tests the split by token task -func TestSplitByToken(t *testing.T) { - input := &structpb.Struct{ - Fields: map[string]*structpb.Value{ - "text": {Kind: &structpb.Value_StringValue{StringValue: "Hello world. This is a test."}}, - "model": {Kind: &structpb.Value_StringValue{StringValue: "gpt-3.5-turbo"}}, - }, - } - inputs := []*structpb.Struct{ - input, - } - - e := &execution{} - e.Task = "TASK_SPLIT_BY_TOKEN" - - if _, err := e.Execute(context.Background(), inputs); err != nil { - t.Fatalf("splitByToken returned an error: %v", err) - } -} diff --git a/operator/text/v0/testdata/chinese/chunk1_1.txt b/operator/text/v0/testdata/chinese/chunk1_1.txt new file mode 100644 index 00000000..a223772f --- /dev/null +++ b/operator/text/v0/testdata/chinese/chunk1_1.txt @@ -0,0 +1,4 @@ +# A +我怎麼知道誰比較強1 +我怎麼知道誰比較強2 +我怎麼知道誰比較強3 \ No newline at end of file diff --git a/operator/text/v0/testdata/chinese/chunk1_2.txt b/operator/text/v0/testdata/chinese/chunk1_2.txt new file mode 100644 index 00000000..151d8ba9 --- /dev/null +++ b/operator/text/v0/testdata/chinese/chunk1_2.txt @@ -0,0 +1,2 @@ +我怎麼知道誰比較強3 +我怎麼知道誰比較強4 \ No newline at end of file diff --git a/operator/text/v0/testdata/chinese/chunk1_3.txt b/operator/text/v0/testdata/chinese/chunk1_3.txt new file mode 100644 index 00000000..aac3020f --- /dev/null +++ b/operator/text/v0/testdata/chinese/chunk1_3.txt @@ -0,0 +1,3 @@ +我怎麼知道誰比較強5 +我怎麼知道誰比較強6 +我怎麼知道誰比較強7 \ No newline at end of file diff --git a/operator/text/v0/testdata/chinese/text1.txt b/operator/text/v0/testdata/chinese/text1.txt new file mode 100644 index 00000000..f9e111ac --- /dev/null +++ b/operator/text/v0/testdata/chinese/text1.txt @@ -0,0 +1,32 @@ +# A +我怎麼知道誰比較強1 +我怎麼知道誰比較強2 +我怎麼知道誰比較強3 
+我怎麼知道誰比較強4 + +我怎麼知道誰比較強5 +我怎麼知道誰比較強6 +我怎麼知道誰比較強7 +我怎麼知道誰比較強8 +## B +我怎麼知道誰比較若9 +我怎麼知道誰比較若0 +我怎麼知道誰比較若1 +若若若若若若若若若2 +我怎麼知道誰比較若3 +若若若若若若若若若4 +我怎麼知道誰比較若5 +若若若若若若若若若6 + + +### C +不知道就不要亂說話7 +不知道就不要亂說話8 +不知道就不要亂說話9 +不知道就不要亂說話0 +不知道就不要亂說話1 +不知道就不要亂說話2 +不知道就不要亂說話3 + +不知道就不要亂說話4 +不知道就不要亂說話5 \ No newline at end of file diff --git a/operator/text/v0/testdata/chinese_markdown/chunk1_1.txt b/operator/text/v0/testdata/chinese_markdown/chunk1_1.txt new file mode 100644 index 00000000..ff10675f --- /dev/null +++ b/operator/text/v0/testdata/chinese_markdown/chunk1_1.txt @@ -0,0 +1,5 @@ +# A +我怎麼知道誰比較強1 +我怎麼知道誰比較強2 +我怎麼知道誰比較強3 +我怎麼知道誰比較強4 \ No newline at end of file diff --git a/operator/text/v0/testdata/chinese_markdown/chunk1_2.txt b/operator/text/v0/testdata/chinese_markdown/chunk1_2.txt new file mode 100644 index 00000000..4b4758c7 --- /dev/null +++ b/operator/text/v0/testdata/chinese_markdown/chunk1_2.txt @@ -0,0 +1,5 @@ +# A +我怎麼知道誰比較強5 +我怎麼知道誰比較強6 +我怎麼知道誰比較強7 +我怎麼知道誰比較強8 \ No newline at end of file diff --git a/operator/text/v0/testdata/chinese_markdown/chunk1_3.txt b/operator/text/v0/testdata/chinese_markdown/chunk1_3.txt new file mode 100644 index 00000000..38d8655d --- /dev/null +++ b/operator/text/v0/testdata/chinese_markdown/chunk1_3.txt @@ -0,0 +1,5 @@ +## B +我怎麼知道誰比較若9 +我怎麼知道誰比較若0 +我怎麼知道誰比較若1 +若若若若若若若若若2 \ No newline at end of file diff --git a/operator/text/v0/testdata/chinese_markdown/text1.txt b/operator/text/v0/testdata/chinese_markdown/text1.txt new file mode 100644 index 00000000..f9e111ac --- /dev/null +++ b/operator/text/v0/testdata/chinese_markdown/text1.txt @@ -0,0 +1,32 @@ +# A +我怎麼知道誰比較強1 +我怎麼知道誰比較強2 +我怎麼知道誰比較強3 +我怎麼知道誰比較強4 + +我怎麼知道誰比較強5 +我怎麼知道誰比較強6 +我怎麼知道誰比較強7 +我怎麼知道誰比較強8 +## B +我怎麼知道誰比較若9 +我怎麼知道誰比較若0 +我怎麼知道誰比較若1 +若若若若若若若若若2 +我怎麼知道誰比較若3 +若若若若若若若若若4 +我怎麼知道誰比較若5 +若若若若若若若若若6 + + +### C +不知道就不要亂說話7 +不知道就不要亂說話8 +不知道就不要亂說話9 +不知道就不要亂說話0 +不知道就不要亂說話1 +不知道就不要亂說話2 +不知道就不要亂說話3 + +不知道就不要亂說話4 +不知道就不要亂說話5 \ No newline at end of file diff --git 
a/operator/text/v0/testdata/english/chunk1_1.txt b/operator/text/v0/testdata/english/chunk1_1.txt new file mode 100644 index 00000000..dd7935a3 --- /dev/null +++ b/operator/text/v0/testdata/english/chunk1_1.txt @@ -0,0 +1,3 @@ +# A +I love this +123456789A \ No newline at end of file diff --git a/operator/text/v0/testdata/english/chunk1_2.txt b/operator/text/v0/testdata/english/chunk1_2.txt new file mode 100644 index 00000000..3749b13d --- /dev/null +++ b/operator/text/v0/testdata/english/chunk1_2.txt @@ -0,0 +1,4 @@ +# A +123456789A +123456789B +123456789C \ No newline at end of file diff --git a/operator/text/v0/testdata/english/chunk1_3.txt b/operator/text/v0/testdata/english/chunk1_3.txt new file mode 100644 index 00000000..66a4eb88 --- /dev/null +++ b/operator/text/v0/testdata/english/chunk1_3.txt @@ -0,0 +1,3 @@ +# A +123456789C +123456789D \ No newline at end of file diff --git a/operator/text/v0/testdata/english/text1.txt b/operator/text/v0/testdata/english/text1.txt new file mode 100644 index 00000000..88d87c3c --- /dev/null +++ b/operator/text/v0/testdata/english/text1.txt @@ -0,0 +1,37 @@ +# A +I love this +123456789A +123456789B +123456789C +123456789D + +## B +9876543210E +9876543210F +9876543210G +9876543210H +9876543210I +9876543210J +9876543210K + + +# C +afdgadfg1 +terterert2 +qertqqert3 +qertqrt4 +qertertqer5 + +#### D +3651351616a +146546136469b +16358416841c +65416481684d +161468416e + +### E +afgaergergt6 +ergergergerg7 +qeargergergerg8 +qergqergqerg9 +qergqergergrg0 \ No newline at end of file diff --git a/tools/compogen/README.md b/tools/compogen/README.md index 9451f330..aa96c725 100644 --- a/tools/compogen/README.md +++ b/tools/compogen/README.md @@ -58,7 +58,7 @@ documentation. For instance, one might want to dedicate a section to add a guide to configuring an account in a 3rd party vendor or to explain in details a particular configuration of a component. 
-The `exttraContents` flag in the `readme` subcommand lets `compogen` inject the +The `extraContents` flag in the `readme` subcommand lets `compogen` inject the content of a document into the generated file. The content will be added verbatim, so it should complain with the MDX syntax. @@ -70,7 +70,7 @@ The following section IDs are accepted: - `release` - `config` - `setup` -- Any task ID defined in `tasks.json` (e.g. `TASK_SPLIT_BY_TOKEN`) +- Any task ID defined in `tasks.json` (e.g. `TASK_CHUNK_TEXT`) - `bottom` More than one section can be extended with this flag: From 4b8bbc7ad39600d115f5f422c69e04181ae497a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A5=8A=E7=AB=A3=E5=87=B1?= <85488391+YCK1130@users.noreply.github.com> Date: Fri, 12 Jul 2024 03:10:50 +0100 Subject: [PATCH 2/4] feat: GitHub component pagination (#212) Because - Users may want to control how much data they wish to receive. This commit - Add pagination options to all list tasks - Update readme --- application/github/v0/README.mdx | 86 ++++++++++++---- application/github/v0/config/tasks.json | 127 ++++++++++++++++++++++++ application/github/v0/issues.go | 5 + application/github/v0/pull_request.go | 9 ++ application/github/v0/review_comment.go | 5 + application/github/v0/utils.go | 5 + 6 files changed, 219 insertions(+), 18 deletions(-) diff --git a/application/github/v0/README.mdx b/application/github/v0/README.mdx index bba692a0..9416c6b0 100644 --- a/application/github/v0/README.mdx +++ b/application/github/v0/README.mdx @@ -11,7 +11,7 @@ It can carry out the following tasks: - [List Pull Requests](#list-pull-requests) - [Get Pull Request](#get-pull-request) - [Get Commit](#get-commit) -- [Get Review Comments](#get-review-comments) +- [List Review Comments](#list-review-comments) - [Create Review Comment](#create-review-comment) - [List Issues](#list-issues) - [Get Issue](#get-issue) @@ -58,12 +58,14 @@ Get the list of all pull requests in a repository | State | `state` | string | State of the 
PRs, including open, closed, all. Default is open | | Sort | `sort` | string | Sort the PRs by created, updated, popularity, or long-running. Default is created | | Direction | `direction` | string | Direction of the sort, including asc or desc. Default is desc | +| Page | `page` | integer | Page number of the results to fetch. Default is 1 | +| Per Page | `per-page` | integer | Number of results to fetch per page. Default is 30 | | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Pull Requests | `pull_requests` | array[object] | An array of PRs | +| Pull Requests | `pull-requests` | array[object] | An array of PRs | @@ -80,13 +82,24 @@ Get a pull request from a repository, given the PR number. This will default to | Task ID (required) | `task` | string | `TASK_GET_PULL_REQUEST` | | Owner (required) | `owner` | string | Owner of the repository | | Repository (required) | `repository` | string | Repository name | -| PR Number | `pr_number` | integer | Number of the PR. `0` for the latest PR | +| PR Number | `pr-number` | integer | Number of the PR. 
`0` for the latest PR | | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Pull Request | `pull_request` | object | A pull request in GitHub | +| PR id (optional) | `id` | integer | id of the PR | +| PR number (optional) | `number` | integer | number of the PR | +| PR state (optional) | `state` | string | state of the PR | +| PR Title (optional) | `title` | string | Title of the PR | +| PR body (optional) | `body` | string | Body of the PR | +| PR diff url (optional) | `diff-url` | string | url to the diff of the PR | +| PR head (optional) | `head` | string | head commit of the PR (in SHA value) | +| PR base (optional) | `base` | string | base commit of the PR (in SHA value) | +| Number of PR comments (optional) | `comments-num` | integer | number of comments on the PR | +| Number of PR commits (optional) | `commits-num` | integer | number of commits in the PR | +| Number of PR review comments (optional) | `review-comments-num` | integer | number of review comments in the PR | +| Commits (optional) | `commits` | array[object] | commits in the PR | @@ -109,27 +122,32 @@ Get a commit from a repository, given the commit SHA | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Commit | `commit` | object | A commit in GitHub | +| Commit SHA (optional) | `sha` | string | SHA of the commit | +| Commit message (optional) | `message` | string | message of the commit | +| Commit stats (optional) | `stats` | object | stats of changes | +| Files (optional) | `files` | array[object] | files in the commit | -### Get Review Comments +### List Review Comments Get the review comments in a pull request | Input | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Task ID (required) | `task` | string | `TASK_GET_REVIEW_COMMENTS` | +| Task ID (required) | `task` | string | `TASK_LIST_REVIEW_COMMENTS` | | Owner (required) | `owner` | string | Owner of the repository | | Repository (required) | `repository` | string | Repository name | -| PR 
Number | `pr_number` | integer | Number of the PR. Default `0` is the latest PR | +| PR Number | `pr-number` | integer | Number of the PR. Default is `0`, which retrieves all comments on all PRs in the repository | | Sort | `sort` | string | Sort the comments by created, updated. Default is created | | Direction | `direction` | string | Direction of the sort, including asc or desc. Default is desc | | Since | `since` | string | Only comments updated at or after this time are returned. Default is 2021-01-01T00:00:00Z | +| Page | `page` | integer | Page number of the results to fetch. Default is 1 | +| Per Page | `per-page` | integer | Number of results to fetch per page. Default is 30 | @@ -152,14 +170,26 @@ Create a review comment in pull request. | Task ID (required) | `task` | string | `TASK_CREATE_REVIEW_COMMENT` | | Owner (required) | `owner` | string | Owner of the repository | | Repository (required) | `repository` | string | Repository name | -| PR Number (required) | `pr_number` | integer | Number of the PR | +| PR Number (required) | `pr-number` | integer | Number of the PR | | Comment (required) | `comment` | object | The comment to be added | | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Review Comment | `comment` | object | The created comment | +| Comment id (optional) | `id` | integer | ID of the comment | +| In Reply To (optional) | `in-reply-to-id` | integer | ID of the comment this comment is in reply to | +| Commit SHA (optional) | `commitId` | string | SHA of the commit on which you want to comment | +| Comment body (optional) | `body` | string | Body of the comment | +| Comment path (optional) | `path` | string | Path of the file the comment is on | +| Comment end line (optional) | `line` | integer | The line of the blob in the pull request diff that the comment applies to. For a multi-line comment, the last line of the range that your comment applies to. 
| +| Comment start line (optional) | `start-line` | integer | The first line in the pull request diff that your multi-line comment applies to. Only multi-line comment needs to fill this field. | +| Comment end side (optional) | `side` | string | Side of the end line, can be one of: LEFT, RIGHT, side. LEFT is the left side of the diff (deletion), RIGHT is the right side of the diff (addition), and side is the comment on the PR as a whole. Default is side | +| Comment start side (optional) | `start-side` | string | Side of the start line, can be one of: LEFT, RIGHT, side. LEFT is the left side of the diff (deletion), RIGHT is the right side of the diff (addition), and side is the comment on the PR as a whole. Default is side | +| Comment type (optional) | `subject-type` | string | Subject type of the comment, can be one of: line, file. Default is line | +| Comment created at (optional) | `created-at` | string | Time the comment was created | +| Comment updated at (optional) | `updated-at` | string | Time the comment was updated | +| User (optional) | `user` | object | User who created the comment | @@ -180,7 +210,9 @@ Get the list of all issues in a repository | Sort | `sort` | string | Sort the issues by created, updated, popularity, or long-running. Default is created | | Direction | `direction` | string | Direction of the sort, can be one of: asc, desc. Default is desc | | Since | `since` | string | Only issues updated at or after this time are returned. Default is 2021-01-01T00:00:00Z | -| No Pull Request | `no_pull_request` | boolean | Whether to not include pull requests in the issues. Default is false | +| No Pull Request | `no-pull-request` | boolean | Whether to `not` include pull requests in the response. Since issue and pr use the same indexing system in GitHub, the API returns all relevant objects (issues and pr). Default is false | +| Page | `page` | integer | Page number of the results to fetch. 
Default is 1 | +| Per Page | `per-page` | integer | Number of results to fetch per page. Default is 30 | @@ -203,13 +235,20 @@ Get an issue. | Task ID (required) | `task` | string | `TASK_GET_ISSUE` | | Owner (required) | `owner` | string | Owner of the repository | | Repository (required) | `repository` | string | Repository name | -| Issue Number (required) | `issue_number` | integer | Number of the issue | +| Issue Number (required) | `issue-number` | integer | Number of the issue | | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Issue | `issue` | object | An issue in GitHub | +| Issue number (optional) | `number` | integer | Number of the issue | +| Issue state (optional) | `state` | string | State of the issue | +| Issue title (optional) | `title` | string | Title of the issue | +| Issue body (optional) | `body` | string | Body of the issue | +| Assignee (optional) | `assignee` | string | Assignee of the issue | +| Assignees (optional) | `assignees` | array[string] | Assignees of the issue | +| Labels (optional) | `labels` | array[string] | Labels of the issue | +| Is Pull Request (optional) | `is-pull-request` | boolean | Whether the issue is a pull request | @@ -231,7 +270,14 @@ Get an issue. | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Issue | `issue` | object | The created issue | +| Issue number (optional) | `number` | integer | Number of the issue | +| Issue state (optional) | `state` | string | State of the issue | +| Issue title (optional) | `title` | string | Title of the issue | +| Issue body (optional) | `body` | string | Body of the issue | +| Assignee (optional) | `assignee` | string | Assignee of the issue | +| Assignees (optional) | `assignees` | array[string] | Assignees of the issue | +| Labels (optional) | `labels` | array[string] | Labels of the issue | +| Is Pull Request (optional) | `is-pull-request` | boolean | Whether the issue is a pull request | @@ -246,17 +292,21 @@ Get an issue. 
| Task ID (required) | `task` | string | `TASK_CREATE_WEBHOOK` | | Owner (required) | `owner` | string | Owner of the repository | | Repository (required) | `repository` | string | Repository name | -| Webhook URL (required) | `hook_url` | string | URL to send the payload to | +| Webhook URL (required) | `hook-url` | string | URL to send the payload to | | Events (required) | `events` | array[string] | Events to trigger the webhook. Please see https://docs.github.com/en/webhooks/webhook-events-and-payloads for more information | | Active | `active` | boolean | Whether the webhook is active. Default is false | -| Content Type | `content_type` | string | Content type of the webhook, can be one of: json, form. Default is json | -| Hook Secret | `hook_secret` | string | If provided, the secret will be used as the key to generate the HMAC hex digest value for delivery signature headers. (see https://docs.github.com/en/webhooks/webhook-events-and-payloads#delivery-headers) | +| Content Type | `content-type` | string | Content type of the webhook, can be one of: json, form. Default is json | +| Hook Secret | `hook-secret` | string | If provided, the secret will be used as the key to generate the HMAC hex digest value for delivery signature headers. 
(see https://docs.github.com/en/webhooks/webhook-events-and-payloads#delivery-headers) | | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Webhook | `hook` | object | The created webhook | +| Webhook ID (optional) | `id` | integer | ID of the webhook | +| Webhook URL (optional) | `url` | string | URL of the webhook | +| Ping URL (optional) | `ping-url` | string | URL to ping the webhook | +| Test URL (optional) | `test-url` | string | URL to test the webhook | +| Config (optional) | `config` | object | Configuration of the webhook | diff --git a/application/github/v0/config/tasks.json b/application/github/v0/config/tasks.json index e1bc6477..cbe96695 100644 --- a/application/github/v0/config/tasks.json +++ b/application/github/v0/config/tasks.json @@ -428,6 +428,43 @@ }, "title": "Issue", "type": "object" + }, + "page-options": { + "title": "Page Options", + "description": "Options for listing pages", + "instillFormat": "object", + "type": "object", + "required": [], + "page": { + "default": 1, + "description": "Page number of the results to fetch. Default is 1", + "instillUIOrder": 100, + "title": "Page", + "instillFormat": "integer", + "instillAcceptFormats": [ + "integer" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "type": "integer" + }, + "per-page": { + "default": 30, + "description": "Number of results to fetch per page. Default is 30", + "instillUIOrder": 101, + "title": "Per Page", + "instillFormat": "integer", + "instillAcceptFormats": [ + "integer" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "type": "integer" + } } }, "TASK_LIST_PULL_REQUESTS": { @@ -501,6 +538,36 @@ ], "instillUIOrder": 12, "type": "string" + }, + "page": { + "default": 1, + "description": "Page number of the results to fetch. 
Default is 1", + "instillUIOrder": 100, + "title": "Page", + "instillFormat": "integer", + "instillAcceptFormats": [ + "integer" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "type": "integer" + }, + "per-page": { + "default": 30, + "description": "Number of results to fetch per page. Default is 30", + "instillUIOrder": 101, + "title": "Per Page", + "instillFormat": "integer", + "instillAcceptFormats": [ + "integer" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "type": "integer" } }, "required": [ @@ -662,6 +729,36 @@ ], "instillUIOrder": 13, "type": "string" + }, + "page": { + "default": 1, + "description": "Page number of the results to fetch. Default is 1", + "instillUIOrder": 100, + "title": "Page", + "instillFormat": "integer", + "instillAcceptFormats": [ + "integer" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "type": "integer" + }, + "per-page": { + "default": 30, + "description": "Number of results to fetch per page. Default is 30", + "instillUIOrder": 101, + "title": "Per Page", + "instillFormat": "integer", + "instillAcceptFormats": [ + "integer" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "type": "integer" } }, "required": [ @@ -931,6 +1028,36 @@ ], "instillUIOrder": 14, "type": "boolean" + }, + "page": { + "default": 1, + "description": "Page number of the results to fetch. Default is 1", + "instillUIOrder": 100, + "title": "Page", + "instillFormat": "integer", + "instillAcceptFormats": [ + "integer" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "type": "integer" + }, + "per-page": { + "default": 30, + "description": "Number of results to fetch per page. 
Default is 30", + "instillUIOrder": 101, + "title": "Per Page", + "instillFormat": "integer", + "instillAcceptFormats": [ + "integer" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "type": "integer" } }, "required": [ diff --git a/application/github/v0/issues.go b/application/github/v0/issues.go index b308f222..2eda3109 100644 --- a/application/github/v0/issues.go +++ b/application/github/v0/issues.go @@ -79,6 +79,7 @@ type ListIssuesInput struct { Direction string `json:"direction"` Since string `json:"since"` NoPullRequest bool `json:"no-pull-request"` + PageOptions } type ListIssuesResp struct { @@ -105,6 +106,10 @@ func (githubClient *Client) listIssuesTask(ctx context.Context, props *structpb. Sort: inputStruct.Sort, Direction: inputStruct.Direction, Since: *sinceTime, + ListOptions: github.ListOptions{ + Page: inputStruct.Page, + PerPage: min(inputStruct.PerPage, 100), // GitHub API only allows 100 per page + }, } if opts.Mentioned == "none" { opts.Mentioned = "" diff --git a/application/github/v0/pull_request.go b/application/github/v0/pull_request.go index bd01f36f..c4d59b5f 100644 --- a/application/github/v0/pull_request.go +++ b/application/github/v0/pull_request.go @@ -67,6 +67,7 @@ type ListPullRequestsInput struct { State string `json:"state"` Sort string `json:"sort"` Direction string `json:"direction"` + PageOptions } type ListPullRequestsResp struct { PullRequests []PullRequest `json:"pull-requests"` @@ -88,6 +89,10 @@ func (githubClient *Client) listPullRequestsTask(ctx context.Context, props *str State: inputStruct.State, Sort: inputStruct.Sort, Direction: inputStruct.Direction, + ListOptions: github.ListOptions{ + Page: inputStruct.Page, + PerPage: min(inputStruct.PerPage, 100), // GitHub API only allows 100 per page + }, } prs, _, err := githubClient.PullRequests.List(ctx, owner, repository, opts) if err != nil { @@ -145,6 +150,10 @@ func (githubClient *Client) getPullRequestTask(ctx context.Context, props *struc State: "all", 
Sort: "created", Direction: "desc", + ListOptions: github.ListOptions{ + Page: 1, + PerPage: 1, + }, } prs, _, err := githubClient.PullRequests.List(ctx, owner, repository, opts) if err != nil { diff --git a/application/github/v0/review_comment.go b/application/github/v0/review_comment.go index 20294703..51a7b0ae 100644 --- a/application/github/v0/review_comment.go +++ b/application/github/v0/review_comment.go @@ -26,6 +26,7 @@ type ListReviewCommentsInput struct { Sort string `json:"sort"` Direction string `json:"direction"` Since string `json:"since"` + PageOptions } type ListReviewCommentsResp struct { @@ -56,6 +57,10 @@ func (githubClient *Client) listReviewCommentsTask(ctx context.Context, props *s Sort: inputStruct.Sort, Direction: inputStruct.Direction, Since: *sinceTime, + ListOptions: github.ListOptions{ + Page: inputStruct.Page, + PerPage: min(inputStruct.PerPage, 100), // GitHub API only allows 100 per page + }, } number := inputStruct.PrNumber comments, _, err := githubClient.PullRequests.ListComments(ctx, owner, repository, number, opts) diff --git a/application/github/v0/utils.go b/application/github/v0/utils.go index e5ed5b34..e2d58a1e 100644 --- a/application/github/v0/utils.go +++ b/application/github/v0/utils.go @@ -7,6 +7,11 @@ import ( "github.com/instill-ai/x/errmsg" ) +type PageOptions struct { + Page int `json:"page"` + PerPage int `json:"per-page"` +} + func middleWare(req string) int { if req == "rate_limit" { return 403 From 80415b1467a61348b6d8d32c7199f73de2b6256e Mon Sep 17 00:00:00 2001 From: ChunHao <64747455+chuang8511@users.noreply.github.com> Date: Sun, 14 Jul 2024 14:18:13 +0100 Subject: [PATCH 3/4] feat(cohere): add cohere to be able to use instill credit (#213) Because - we should expose users' input & output for instill credit calculation - The logic between cloud & local should align This commit - expose users' input & output - refactor the embedding function to make the business logic function exposed --- 
ai/cohere/v0/component_test.go | 16 +-- ai/cohere/v0/config/tasks.json | 31 ++++++ ai/cohere/v0/embedding.go | 166 ++++++++++++++++++++------------ ai/cohere/v0/main.go | 2 +- ai/cohere/v0/rerank.go | 8 +- ai/cohere/v0/text_generation.go | 8 +- store/store.go | 8 +- 7 files changed, 157 insertions(+), 82 deletions(-) diff --git a/ai/cohere/v0/component_test.go b/ai/cohere/v0/component_test.go index fe8ab18a..1b216821 100644 --- a/ai/cohere/v0/component_test.go +++ b/ai/cohere/v0/component_test.go @@ -60,10 +60,10 @@ func TestComponent_Tasks(t *testing.T) { commandTc := struct { input map[string]any - wantResp textGenerationOutput + wantResp TextGenerationOutput }{ input: map[string]any{"model-name": "command-r-plus"}, - wantResp: textGenerationOutput{Text: "Hi! My name is command-r-plus.", Citations: []citation{}, Usage: commandUsage{InputTokens: 20, OutputTokens: 30}}, + wantResp: TextGenerationOutput{Text: "Hi! My name is command-r-plus.", Citations: []citation{}, Usage: commandUsage{InputTokens: 20, OutputTokens: 30}}, } c.Run("ok - task command", func(c *qt.C) { @@ -92,10 +92,10 @@ func TestComponent_Tasks(t *testing.T) { embedFloatTc := struct { input map[string]any - wantResp embeddingFloatOutput + wantResp EmbeddingFloatOutput }{ input: map[string]any{"text": "abcde"}, - wantResp: embeddingFloatOutput{Embedding: []float64{0.1, 0.2, 0.3, 0.4, 0.5}, Usage: embedUsage{Tokens: 20}}, + wantResp: EmbeddingFloatOutput{Embedding: []float64{0.1, 0.2, 0.3, 0.4, 0.5}, Usage: embedUsage{Tokens: 20}}, } c.Run("ok - task float embed", func(c *qt.C) { @@ -123,10 +123,10 @@ func TestComponent_Tasks(t *testing.T) { embedIntTc := struct { input map[string]any - wantResp embeddingIntOutput + wantResp EmbeddingIntOutput }{ input: map[string]any{"text": "abcde", "embedding-type": "int8"}, - wantResp: embeddingIntOutput{Embedding: []int{1, 2, 3, 4, 5}, Usage: embedUsage{Tokens: 20}}, + wantResp: EmbeddingIntOutput{Embedding: []int{1, 2, 3, 4, 5}, Usage: embedUsage{Tokens: 20}}, } 
c.Run("ok - task int embed", func(c *qt.C) { @@ -154,10 +154,10 @@ func TestComponent_Tasks(t *testing.T) { rerankTc := struct { input map[string]any - wantResp rerankOutput + wantResp RerankOutput }{ input: map[string]any{"documents": []string{"a", "b", "c", "d"}}, - wantResp: rerankOutput{Ranking: []string{"d", "c", "b", "a"}, Usage: rerankUsage{Search: 5}, Relevance: []float64{10, 9, 8, 7}}, + wantResp: RerankOutput{Ranking: []string{"d", "c", "b", "a"}, Usage: rerankUsage{Search: 5}, Relevance: []float64{10, 9, 8, 7}}, } c.Run("ok - task rerank", func(c *qt.C) { setup, err := structpb.NewStruct(map[string]any{ diff --git a/ai/cohere/v0/config/tasks.json b/ai/cohere/v0/config/tasks.json index e6e096a2..af957f7c 100644 --- a/ai/cohere/v0/config/tasks.json +++ b/ai/cohere/v0/config/tasks.json @@ -262,6 +262,17 @@ "instillAcceptFormats": [ "string" ], + "instillCredentialMap": { + "values": [ + "command-r-plus", + "command-r", + "command", + "command-light" + ], + "targets": [ + "setup.api-key" + ] + }, "instillUIOrder": 0, "instillUpstreamTypes": [ "value", @@ -436,6 +447,17 @@ "instillAcceptFormats": [ "string" ], + "instillCredentialMap": { + "values": [ + "embed-english-v3.0", + "embed-multilingual-v3.0", + "embed-english-light-v3.0", + "embed-multilingual-light-v3.0" + ], + "targets": [ + "setup.api-key" + ] + }, "instillUIOrder": 0, "instillUpstreamTypes": [ "value", @@ -561,6 +583,15 @@ "instillAcceptFormats": [ "string" ], + "instillCredentialMap": { + "values": [ + "rerank-english-v3.0", + "rerank-multilingual-v3.0" + ], + "targets": [ + "setup.api-key" + ] + }, "instillUIOrder": 0, "instillUpstreamTypes": [ "value", diff --git a/ai/cohere/v0/embedding.go b/ai/cohere/v0/embedding.go index 4cc71529..dd5cc610 100644 --- a/ai/cohere/v0/embedding.go +++ b/ai/cohere/v0/embedding.go @@ -8,19 +8,19 @@ import ( "google.golang.org/protobuf/types/known/structpb" ) -type embeddingInput struct { +type EmbeddingInput struct { Text string `json:"text"` ModelName string 
`json:"model-name"` InputType string `json:"input-type"` EmbeddingType string `json:"embedding-type"` } -type embeddingFloatOutput struct { +type EmbeddingFloatOutput struct { Usage embedUsage `json:"usage"` Embedding []float64 `json:"embedding"` } -type embeddingIntOutput struct { +type EmbeddingIntOutput struct { Usage embedUsage `json:"usage"` Embedding []int `json:"embedding"` } @@ -30,24 +30,77 @@ type embedUsage struct { } func (e *execution) taskEmbedding(in *structpb.Struct) (*structpb.Struct, error) { - inputStruct := embeddingInput{} + inputStruct := EmbeddingInput{} err := base.ConvertFromStructpb(in, &inputStruct) if err != nil { return nil, fmt.Errorf("error generating input struct: %v", err) } + if IsEmbeddingOutputInt(inputStruct.EmbeddingType) { + tokenCount, embedding, err := processWithIntOutput(e, inputStruct) + if err != nil { + return nil, err + } + + outputStruct := EmbeddingIntOutput{ + Usage: embedUsage{ + Tokens: tokenCount, + }, + Embedding: embedding, + } + output, err := base.ConvertToStructpb(outputStruct) + if err != nil { + return nil, err + } + return output, nil + } + + tokenCount, embedding, err := processWithFloatOutput(e, inputStruct) + if err != nil { + return nil, err + } + outputStruct := EmbeddingFloatOutput{ + Usage: embedUsage{ + Tokens: tokenCount, + }, + Embedding: embedding, + } + output, err := base.ConvertToStructpb(outputStruct) + if err != nil { + return nil, err + } + return output, nil + +} + +func IsEmbeddingOutputInt(embeddingType string) bool { + return embeddingType == "int8" || embeddingType == "uint8" || embeddingType == "binary" || embeddingType == "ubinary" +} + +func processWithIntOutput(e *execution, inputStruct EmbeddingInput) (tokenCount int, embedding []int, err error) { + req := cohereSDK.EmbedRequest{ + Texts: []string{inputStruct.Text}, + Model: &inputStruct.ModelName, + InputType: (*cohereSDK.EmbedInputType)(&inputStruct.InputType), + EmbeddingTypes: 
[]cohereSDK.EmbeddingType{cohereSDK.EmbeddingType(inputStruct.EmbeddingType)}, + } + resp, err := e.client.generateEmbedding(req) + + if err != nil { + return 0, nil, err + } + + embeddingResult, err := getIntEmbedding(resp, inputStruct.EmbeddingType) + if err != nil { + return 0, nil, err + } + return getBillingTokens(resp, inputStruct.EmbeddingType), embeddingResult, nil +} + +func processWithFloatOutput(e *execution, inputStruct EmbeddingInput) (tokenCount int, embedding []float64, err error) { embeddingTypeArray := []cohereSDK.EmbeddingType{} - switch inputStruct.EmbeddingType { - case "float": + if inputStruct.EmbeddingType == "float" { embeddingTypeArray = append(embeddingTypeArray, cohereSDK.EmbeddingTypeFloat) - case "int8": - embeddingTypeArray = append(embeddingTypeArray, cohereSDK.EmbeddingTypeInt8) - case "uint8": - embeddingTypeArray = append(embeddingTypeArray, cohereSDK.EmbeddingTypeUint8) - case "binary": - embeddingTypeArray = append(embeddingTypeArray, cohereSDK.EmbeddingTypeBinary) - case "ubinary": - embeddingTypeArray = append(embeddingTypeArray, cohereSDK.EmbeddingTypeUbinary) } req := cohereSDK.EmbedRequest{ Texts: []string{inputStruct.Text}, @@ -56,58 +109,43 @@ func (e *execution) taskEmbedding(in *structpb.Struct) (*structpb.Struct, error) EmbeddingTypes: embeddingTypeArray, } resp, err := e.client.generateEmbedding(req) + if err != nil { - return nil, err + return 0, nil, err } - switch inputStruct.EmbeddingType { - case "int8", "uint8", "binary", "ubinary": - bills := resp.EmbeddingsByType.Meta.BilledUnits - outputStruct := embeddingIntOutput{ - Usage: embedUsage{ - Tokens: int(*bills.InputTokens), - }, - } - switch inputStruct.EmbeddingType { - case "int8": - outputStruct.Embedding = resp.EmbeddingsByType.Embeddings.Int8[0] - case "uint8": - outputStruct.Embedding = resp.EmbeddingsByType.Embeddings.Uint8[0] - case "binary": - outputStruct.Embedding = resp.EmbeddingsByType.Embeddings.Binary[0] - case "ubinary": - outputStruct.Embedding = 
resp.EmbeddingsByType.Embeddings.Ubinary[0] - } - output, err := base.ConvertToStructpb(outputStruct) - if err != nil { - return nil, err - } - return output, nil - case "float": - bills := resp.EmbeddingsByType.Meta.BilledUnits - outputStruct := embeddingFloatOutput{ - Usage: embedUsage{ - Tokens: int(*bills.InputTokens), - }, - Embedding: resp.EmbeddingsByType.Embeddings.Float[0], - } - output, err := base.ConvertToStructpb(outputStruct) - if err != nil { - return nil, err - } - return output, nil - default: - bills := resp.EmbeddingsFloats.Meta.BilledUnits - outputStruct := embeddingFloatOutput{ - Usage: embedUsage{ - Tokens: int(*bills.InputTokens), - }, - Embedding: resp.EmbeddingsFloats.Embeddings[0], - } - output, err := base.ConvertToStructpb(outputStruct) - if err != nil { - return nil, err - } - return output, nil + embeddingResult := getFloatEmbedding(resp, inputStruct.EmbeddingType) + + return getBillingTokens(resp, inputStruct.EmbeddingType), embeddingResult, nil + +} + +func getIntEmbedding(resp cohereSDK.EmbedResponse, embeddingType string) ([]int, error) { + switch embeddingType { + case "int8": + return resp.EmbeddingsByType.Embeddings.Int8[0], nil + case "uint8": + return resp.EmbeddingsByType.Embeddings.Uint8[0], nil + case "binary": + return resp.EmbeddingsByType.Embeddings.Binary[0], nil + case "ubinary": + return resp.EmbeddingsByType.Embeddings.Ubinary[0], nil + } + return nil, fmt.Errorf("invalid embedding type: %s", embeddingType) +} + +func getFloatEmbedding(resp cohereSDK.EmbedResponse, embeddingType string) []float64 { + if embeddingType == "float" { + return resp.EmbeddingsByType.Embeddings.Float[0] + } else { + return resp.EmbeddingsFloats.Embeddings[0] + } +} + +func getBillingTokens(resp cohereSDK.EmbedResponse, embeddingType string) int { + if IsEmbeddingOutputInt(embeddingType) || embeddingType == "float" { + return int(*resp.EmbeddingsByType.Meta.BilledUnits.InputTokens) + } else { + return 
int(*resp.EmbeddingsFloats.Meta.BilledUnits.InputTokens) } } diff --git a/ai/cohere/v0/main.go b/ai/cohere/v0/main.go index 55ece68c..106cbb69 100644 --- a/ai/cohere/v0/main.go +++ b/ai/cohere/v0/main.go @@ -75,7 +75,7 @@ func (c *component) CreateExecution(sysVars map[string]any, setup *structpb.Stru } e := &execution{ ComponentExecution: base.ComponentExecution{Component: c, SystemVariables: sysVars, Task: task, Setup: resolvedSetup}, - client: newClient(getAPIKey(setup), c.GetLogger()), + client: newClient(getAPIKey(resolvedSetup), c.GetLogger()), usesInstillCredentials: resolved, } switch task { diff --git a/ai/cohere/v0/rerank.go b/ai/cohere/v0/rerank.go index bcecb24c..be77c07c 100644 --- a/ai/cohere/v0/rerank.go +++ b/ai/cohere/v0/rerank.go @@ -8,13 +8,13 @@ import ( "google.golang.org/protobuf/types/known/structpb" ) -type rerankInput struct { +type RerankInput struct { Query string `json:"query"` Documents []string `json:"documents"` ModelName string `json:"model-name"` } -type rerankOutput struct { +type RerankOutput struct { Ranking []string `json:"ranking"` Usage rerankUsage `json:"usage"` Relevance []float64 `json:"relevance"` @@ -26,7 +26,7 @@ type rerankUsage struct { func (e *execution) taskRerank(in *structpb.Struct) (*structpb.Struct, error) { - inputStruct := rerankInput{} + inputStruct := RerankInput{} err := base.ConvertFromStructpb(in, &inputStruct) if err != nil { return nil, fmt.Errorf("error generating input struct: %v", err) @@ -61,7 +61,7 @@ func (e *execution) taskRerank(in *structpb.Struct) (*structpb.Struct, error) { } bills := resp.Meta.BilledUnits - outputStruct := rerankOutput{ + outputStruct := RerankOutput{ Ranking: newRanking, Usage: rerankUsage{Search: int(*bills.SearchUnits)}, Relevance: relevance, diff --git a/ai/cohere/v0/text_generation.go b/ai/cohere/v0/text_generation.go index 350feb6c..60b7efe4 100644 --- a/ai/cohere/v0/text_generation.go +++ b/ai/cohere/v0/text_generation.go @@ -22,7 +22,7 @@ type MultiModalContent 
struct { Type string `json:"type"` } -type textGenerationInput struct { +type TextGenerationInput struct { ChatHistory []ChatMessage `json:"chat-history"` MaxNewTokens int `json:"max-new-tokens"` ModelName string `json:"model-name"` @@ -46,7 +46,7 @@ type commandUsage struct { OutputTokens int `json:"output-tokens"` } -type textGenerationOutput struct { +type TextGenerationOutput struct { Text string `json:"text"` Citations []citation `json:"citations"` Usage commandUsage `json:"usage"` @@ -54,7 +54,7 @@ type textGenerationOutput struct { func (e *execution) taskTextGeneration(in *structpb.Struct) (*structpb.Struct, error) { - inputStruct := textGenerationInput{} + inputStruct := TextGenerationInput{} err := base.ConvertFromStructpb(in, &inputStruct) if err != nil { return nil, fmt.Errorf("error generating input struct: %v", err) @@ -123,7 +123,7 @@ func (e *execution) taskTextGeneration(in *structpb.Struct) (*structpb.Struct, e inputTokens := *bills.InputTokens outputTokens := *bills.OutputTokens - outputStruct := textGenerationOutput{ + outputStruct := TextGenerationOutput{ Text: resp.Text, Citations: citations, Usage: commandUsage{ diff --git a/store/store.go b/store/store.go index fd5e8d70..2a2ee852 100644 --- a/store/store.go +++ b/store/store.go @@ -108,7 +108,13 @@ func Init( conn = conn.WithInstillCredentials(secrets[conn.GetDefinitionID()]) compStore.Import(conn) } - compStore.Import(cohere.Init(baseComp)) + { + // Cohere + conn := cohere.Init(baseComp) + conn = conn.WithInstillCredentials(secrets[conn.GetDefinitionID()]) + compStore.Import(conn) + } + compStore.Import(archetypeai.Init(baseComp)) compStore.Import(numbers.Init(baseComp)) compStore.Import(bigquery.Init(baseComp)) From cba4aacdda1bca8fe676f299248c04547996c828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Vall=C3=A9s?= <3977183+jvallesm@users.noreply.github.com> Date: Mon, 15 Jul 2024 03:36:47 +0200 Subject: [PATCH 4/4] feat: add JQ input field that accepts any type (#201) Because - The `jq` 
task took only strings, which can be leveraged for reference but forced users to pre-process objects with another JSON operator that marshalled the input. This commit - Introduces a new input field for the task, which can be any object - The old value is hidden under the "More" settings section. If the new field is empty, task will fall back to the old field (for backwards compatibility). - Removes object type assumption in unmarshal task: now it can unmarshal any valid JSON value. --- operator/json/v0/README.mdx | 3 +- operator/json/v0/component_test.go | 74 ++++++++++++++++++++++++------ operator/json/v0/config/tasks.json | 27 +++++++++-- operator/json/v0/main.go | 19 ++++---- 4 files changed, 96 insertions(+), 27 deletions(-) diff --git a/operator/json/v0/README.mdx b/operator/json/v0/README.mdx index 2a884a27..265f1515 100644 --- a/operator/json/v0/README.mdx +++ b/operator/json/v0/README.mdx @@ -80,8 +80,9 @@ Process JSON through a `jq` command | Input | ID | Type | Description | | :--- | :--- | :--- | :--- | | Task ID (required) | `task` | string | `TASK_JQ` | -| JSON input (required) | `jsonInput` | string | JSON string to be processed | +| JSON value | `json-value` | any | JSON value (e.g. string, number, object, array...) to be processed by the filter | | Filter (required) | `jqFilter` | string | Filter, in `jq` syntax, that will be applied to the JSON input | +| (DEPRECATED) JSON input | `jsonInput` | string | (DEPRECATED, use 'JSON value' instead) JSON string to be processed. This field allows templated inputs, but the data might require preprocessing (marshalling). This field will be used in absence of 'JSON value' for backwards compatibility reasons.
| diff --git a/operator/json/v0/component_test.go b/operator/json/v0/component_test.go index e14b0f2e..d8df27e2 100644 --- a/operator/json/v0/component_test.go +++ b/operator/json/v0/component_test.go @@ -40,12 +40,19 @@ func TestOperator_Execute(t *testing.T) { wantJSON json.RawMessage }{ { - name: "ok - marshal", + name: "ok - marshal object", task: taskMarshal, in: map[string]any{"json": asMap}, wantJSON: json.RawMessage(asJSON), }, + { + name: "ok - marshal array", + + task: taskMarshal, + in: map[string]any{"json": []any{false, true, "dos", 3}}, + wantJSON: json.RawMessage(`[false, true, "dos", 3]`), + }, { name: "nok - marshal", @@ -60,6 +67,20 @@ func TestOperator_Execute(t *testing.T) { in: map[string]any{"string": asJSON}, want: map[string]any{"json": asMap}, }, + { + name: "ok - unmarshal array", + + task: taskUnmarshal, + in: map[string]any{"string": `[false, true, "dos", 3]`}, + want: map[string]any{"json": []any{false, true, "dos", 3}}, + }, + { + name: "ok - unmarshal string", + + task: taskUnmarshal, + in: map[string]any{"string": `"foo"`}, + want: map[string]any{"json": "foo"}, + }, { name: "nok - unmarshal", @@ -68,7 +89,7 @@ func TestOperator_Execute(t *testing.T) { wantErr: "Couldn't parse the JSON string. Please check the syntax is correct.", }, { - name: "ok - jq", + name: "ok - jq from string", task: taskJQ, in: map[string]any{ @@ -80,28 +101,55 @@ func TestOperator_Execute(t *testing.T) { }, }, { - name: "ok - jq create object", + name: "nok - jq invalid JSON string", + + task: taskJQ, + in: map[string]any{ + "jsonInput": "{", + "jqFilter": ".", + }, + wantErr: "Couldn't parse the JSON input. Please check the syntax is correct.", + }, + { + name: "ok - string value", task: taskJQ, in: map[string]any{ - "jsonInput": `{"id": "sample", "10": {"b": 42}}`, - "jqFilter": `{(.id): .["10"].b}`, + "json-value": "foo", + "jqFilter": `. 
+ "bar"`, }, want: map[string]any{ - "results": []any{ - map[string]any{"sample": 42}, - }, + "results": []any{"foobar"}, }, }, { - name: "nok - jq invalid JSON input", + name: "ok - from array", task: taskJQ, in: map[string]any{ - "jsonInput": "{", - "jqFilter": ".", + "json-value": []any{2, 3, 23}, + "jqFilter": ".[2]", + }, + want: map[string]any{ + "results": []any{23}, + }, + }, + { + name: "ok - jq create object", + + task: taskJQ, + in: map[string]any{ + "json-value": map[string]any{ + "id": "sample", + "10": map[string]any{"b": 42}, + }, + "jqFilter": `{(.id): .["10"].b}`, + }, + want: map[string]any{ + "results": []any{ + map[string]any{"sample": 42}, + }, }, - wantErr: "Couldn't parse the JSON input. Please check the syntax is correct.", }, { name: "nok - jq invalid filter", @@ -138,7 +186,7 @@ func TestOperator_Execute(t *testing.T) { if tc.wantJSON != nil { // Check JSON in the output string. b := got[0].Fields["string"].GetStringValue() - c.Check([]byte(b), qt.JSONEquals, tc.wantJSON) + c.Check([]byte(b), qt.JSONEquals, tc.wantJSON, qt.Commentf(string(b)+" vs "+string(tc.wantJSON))) return } diff --git a/operator/json/v0/config/tasks.json b/operator/json/v0/config/tasks.json index ee166b2a..6ae66ac2 100644 --- a/operator/json/v0/config/tasks.json +++ b/operator/json/v0/config/tasks.json @@ -115,8 +115,9 @@ "instillUIOrder": 0, "properties": { "jsonInput": { - "instillUIOrder": 0, - "description": "JSON string to be processed", + "instillUIOrder": 2, + "description": "(DEPRECATED, use 'JSON value' instead) JSON string to be processed. This field allows templated inputs, but the data might require preprocessing (marshalling).
This field will be used in absence of 'JSON value' for backwards compatibility reasons.", + "instillShortDescription": "(DEPRECATED) JSON string to be processed", "instillAcceptFormats": [ "string" ], @@ -126,9 +127,24 @@ "template" ], "instillUIMultiline": true, - "title": "JSON input", + "title": "(DEPRECATED) JSON input", "type": "string" }, + "json-value": { + "instillUIOrder": 0, + "description": "JSON value (e.g. string, number, object, array...) to be processed by the filter", + "instillAcceptFormats": [ + "object", + "structured/*", + "semi-structured/*" + ], + "instillUpstreamTypes": [ + "value", + "reference" + ], + "instillUIMultiline": true, + "title": "JSON value" + }, "jqFilter": { "instillUIOrder": 1, "description": "Filter, in `jq` syntax, that will be applied to the JSON input", @@ -146,7 +162,10 @@ } }, "required": [ - "jsonInput", + "jqFilter" + ], + "instillEditOnNodeFields": [ + "json-value", "jqFilter" ], "title": "Input", diff --git a/operator/json/v0/main.go b/operator/json/v0/main.go index 8ecc8b3d..e938528f 100644 --- a/operator/json/v0/main.go +++ b/operator/json/v0/main.go @@ -3,11 +3,12 @@ package json import ( "context" - _ "embed" "encoding/json" "fmt" "sync" + _ "embed" + "github.com/itchyny/gojq" "google.golang.org/protobuf/encoding/protojson" "google.golang.org/protobuf/types/known/structpb" @@ -94,14 +95,12 @@ func (e *execution) unmarshal(in *structpb.Struct) (*structpb.Struct, error) { out := new(structpb.Struct) b := []byte(in.Fields["string"].GetStringValue()) - obj := new(structpb.Struct) + obj := new(structpb.Value) if err := protojson.Unmarshal(b, obj); err != nil { return nil, errmsg.AddMessage(err, "Couldn't parse the JSON string. 
Please check the syntax is correct.") } - out.Fields = map[string]*structpb.Value{ - "json": structpb.NewStructValue(obj), - } + out.Fields = map[string]*structpb.Value{"json": obj} return out, nil } @@ -109,10 +108,12 @@ func (e *execution) unmarshal(in *structpb.Struct) (*structpb.Struct, error) { func (e *execution) jq(in *structpb.Struct) (*structpb.Struct, error) { out := new(structpb.Struct) - b := []byte(in.Fields["jsonInput"].GetStringValue()) - var input any - if err := json.Unmarshal(b, &input); err != nil { - return nil, errmsg.AddMessage(err, "Couldn't parse the JSON input. Please check the syntax is correct.") + input := in.Fields["json-value"].AsInterface() + if input == nil { + b := []byte(in.Fields["jsonInput"].GetStringValue()) + if err := json.Unmarshal(b, &input); err != nil { + return nil, errmsg.AddMessage(err, "Couldn't parse the JSON input. Please check the syntax is correct.") + } } queryStr := in.Fields["jqFilter"].GetStringValue()