-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculate.ts
34 lines (31 loc) · 976 Bytes
/
calculate.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import { getEncoding } from "./deps.ts";
/** Names of the `Tiktoken` encodings this module accepts. */
type TiktokenEncoding =
  | "gpt2"
  | "r50k_base"
  | "p50k_base"
  | "p50k_edit"
  | "cl100k_base";

/** Encoding used when a caller does not pass one explicitly. */
const DEFAULT_ENCODING: TiktokenEncoding = "cl100k_base";
/**
 * Encoders already obtained from `getEncoding`, keyed by encoding name, so that
 * repeated calls reuse one instance instead of building a new encoder each time.
 *
 * NOTE(review): assumes the object returned by `getEncoding` can be reused
 * across calls (its `encode` does not depend on prior calls) — confirm against
 * the implementation re-exported from `./deps.ts`.
 */
const encoderCache = new Map<
  TiktokenEncoding,
  ReturnType<typeof getEncoding>
>();

/**
 * Counts the tokens that a text string produces under the given encoding.
 *
 * The encoder for each encoding is created on first use and cached for
 * subsequent calls.
 *
 * @param text - The text to be tokenized.
 * @param encoding - The `Tiktoken` encoding to use for tokenization.
 *   Defaults to `DEFAULT_ENCODING` (`"cl100k_base"`).
 * @returns The number of tokens in the text.
 *
 * @example
 * ```ts
 * import { calculateTokenCount } from "@zac/safe-inbed";
 *
 * console.log("Token count:", calculateTokenCount("Hello, world!"));
 * // Output: Token count: 4
 * ```
 */
export const calculateTokenCount = (
  text: string,
  encoding: TiktokenEncoding = DEFAULT_ENCODING,
): number => {
  let encoder = encoderCache.get(encoding);
  if (encoder === undefined) {
    // First request for this encoding: build the encoder once and remember it.
    encoder = getEncoding(encoding);
    encoderCache.set(encoding, encoder);
  }
  return encoder.encode(text).length;
};