
Commit

Merge pull request #18 from coreweave/rwang.padtoken052324
feat: add default padding token
rtalaricw authored May 31, 2024
2 parents bef6e0d + c4d661e commit b0e1976
Showing 6 changed files with 169 additions and 43 deletions.
15 changes: 15 additions & 0 deletions gpt_bpe.go
@@ -22,6 +22,7 @@ import (
const BPE_LRU_SZ = 65536
const RUNEBUF_SZ = 16384
const WORDCHAN_SZ = 4096
const defaultPadTokenString = "[PAD]"

type Token uint16
type Tokens []Token
@@ -366,6 +367,20 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) {
tokenizerSpecialConfig.AddBosToken = true
tokenizerSpecialConfig.AddEosToken = true
}

// Add in default pad token if not already set
padTokenNotFound := (tokenizerSpecialConfig.PadToken == "" && hfConfig.PadTokenStr == nil)
if padTokenNotFound {
// Inject the default pad token into the encoder at uint16 max;
// throw an error if the vocab is already larger than uint16 max
if len(encoderTokens) >= math.MaxUint16 {
log.Fatalf("Vocab size is larger than uint16 max, default pad token cannot be added. " +
"Please specify a pad token in the vocab file.")
}
encoderTokens[defaultPadTokenString] = math.MaxUint16
tokenizerSpecialConfig.PadToken = defaultPadTokenString
hfConfig.PadTokenStr = &tokenizerSpecialConfig.PadToken
}
encoder := &GPTEncoder{
encoderTokens,
tokensEncoder,
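For context, a short usage sketch of the new fallback (illustrative only; the import path and vocab id below are assumptions, not part of this change — the repository's actual module path and tokenizer ids may differ):

package main

import (
    "fmt"
    "log"

    gpt_bpe "github.com/wbrown/gpt_bpe" // assumed module path
)

func main() {
    // Load a tokenizer whose vocab does not define a pad token
    // (e.g. a Llama-style vocab). With this change the encoder
    // falls back to the default "[PAD]" token at id 65535 (uint16 max).
    encoder, err := gpt_bpe.NewEncoder("llama-tokenizer") // assumed vocab id
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(encoder.PadToken)         // 65535
    fmt.Println(encoder.Encoder["[PAD]"]) // 65535
}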
42 changes: 42 additions & 0 deletions gpt_bpe_test.go
@@ -1196,6 +1196,48 @@ func TestModelDownloadLlama(t *testing.T) {
fmt.Println("All Exists - Looks good.")
}

func TestGPT2DefaultPadding(t *testing.T) {
// GPT-2 defines a padding token; we test that it is resolved correctly
// and corresponds to <|padding|> in the vocab
assert.Equal(t, gpt2Encoder.PadToken, Token(50257))
assert.Equal(t, gpt2Encoder.Encoder["<|padding|>"], Token(50257))
}

func TestPilePadding(t *testing.T) {
// Pile defines a padding token; we test that it is resolved correctly
// and corresponds to <|padding|> in the vocab
assert.Equal(t, pileEncoder.PadToken, Token(1))
assert.Equal(t, pileEncoder.Encoder["<|padding|>"], Token(1))
}

func TestClipPadding(t *testing.T) {
// CLIP defines a padding token; we test that it is resolved correctly
// and corresponds to <|endoftext|> in the vocab
assert.Equal(t, clipEncoder.PadToken, Token(49407))
assert.Equal(t, clipEncoder.Encoder["<|endoftext|>"], Token(49407))
}

func TestNerdstashPadding(t *testing.T) {
// Nerdstash defines a padding token; we test that it is resolved correctly
// and corresponds to <|pad|> in the vocab
assert.Equal(t, nerdstashV2Encoder.PadToken, Token(0))
assert.Equal(t, nerdstashV2Encoder.Encoder["<|pad|>"], Token(0))
}

func TestLlamaPadding(t *testing.T) {
// Llama doesn't define a padding token; we test that it properly defaults
// to [PAD] at id 65535
assert.Equal(t, llama2Encoder.PadToken, Token(65535))
assert.Equal(t, llama2Encoder.Encoder["[PAD]"], Token(65535))
}

func TestMistralPadding(t *testing.T) {
// Mistral doesn't define a padding token; we test that it properly defaults
// to [PAD] at id 65535
assert.Equal(t, mistralEncoder.PadToken, Token(65535))
assert.Equal(t, mistralEncoder.Encoder["[PAD]"], Token(65535))
}

func TestModelDownloadFairseq(t *testing.T) {
// Koboldai's fairseq models are stored in a different format
// it has merges and vocab but no tokenizer.json
3 changes: 1 addition & 2 deletions resources/data/llama-tokenizer/special_tokens_map.json
@@ -13,12 +13,11 @@
"rstrip": false,
"single_word": false
},
"pad_token": "[PAD]",
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}
}
80 changes: 71 additions & 9 deletions resources/resolver.go
@@ -13,6 +13,7 @@ import (
"path"
"regexp"
"strconv"
"strings"
"time"

"github.com/dustin/go-humanize"
@@ -800,11 +801,32 @@ func ResolveConfig(vocabId string, token string) (config *HFConfig,
// Given a set of resources, resolve the HuggingFace configuration.
// Used to be able to resolve both embedded and local resources.
func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig, error) {
//use interfaces to unmarsal the config file and tokenizer config file
// Resolve config and tokenizer config from resources
// config.json and tokenizer_config.json
hfConfig, err := resolveConfigAndTokenizerConfig(resources, hfConfig)
if err != nil {
return nil, err
}

// Resolve special tokens and special tokens config from resources
// special_tokens_map.json and specials.txt
hfConfig, err = resolveSpecialsAndSpecialTokens(resources, hfConfig)
if err != nil {
return nil, err
}
return hfConfig, nil
}

// resolveConfigAndTokenizerConfig
// Resolve config and tokenizer config from resources.
// Used to be able to resolve both embedded and local resources.
// Continuation of ResolveHFFromResources.
func resolveConfigAndTokenizerConfig(resources *Resources, hfConfig *HFConfig) (*HFConfig, error) {
// Use interfaces to unmarshal the config file and tokenizer config file
var config interface{}
var tokenizerConfig interface{}
//if exists, unmarshal config.json and tokenizer_config.json
//use getfile to get the file, then unmarshal it
// If exists, unmarshal config.json and tokenizer_config.json, else
// use GetFile to get the file, then unmarshal it
if _, err := resources.GetFile("config.json"); err == nil {
if err := json.Unmarshal(*((*resources)["config.json"]).Data, &config); err != nil {
fmt.Errorf("Error unmarshalling config.json: %s", err)
@@ -824,12 +846,13 @@ func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig

}

//check if bos_token is in string, this is the old format pythia has. If not, try to unmarshal to the tokenizerSpecials
// Check if bos_token is a plain string; this is the old format Pythia uses.
// If not, try to unmarshal to the tokenizerSpecials format
// that Llama 2 uses, else try the Mistral format
if config != nil || tokenizerConfig != nil {
hasReadConfig := false
if config != nil {
//using interfaces, first check if bos_token is in string format
// Using interfaces, first check if bos_token is in string format
if bosToken, ok := config.(map[string]interface{})["bos_token"].(string); ok {
hfConfig.BosTokenStr = &bosToken
if eosToken, ok := config.(map[string]interface{})["eos_token"].(string); ok {
@@ -842,7 +865,7 @@ func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig
}
}
if tokenizerConfig != nil && !hasReadConfig {
//using interfaces, first check if bos_token is in string format
// Using interfaces, first check if bos_token is in string format
if bosToken, ok := tokenizerConfig.(map[string]interface{})["bos_token"].(string); ok {
hfConfig.BosTokenStr = &bosToken
if eosToken, ok := tokenizerConfig.(map[string]interface{})["eos_token"].(string); ok {
Expand All @@ -854,7 +877,7 @@ func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig
hasReadConfig = true

}
//if not, assume llama2 format and try to unmarshal
// If not, assume llama2 format and try to unmarshal
if !hasReadConfig {
cfg := tokenizerConfig.(map[string]interface{})
if bosToken, ok := cfg["bos_token"].(map[string]interface{}); ok {
@@ -871,7 +894,7 @@ func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig
hfConfig.PadTokenStr = &padToken
}
}
//if that doesn't work, assume mistral format
// If that doesn't work, assume mistral format
if !hasReadConfig {
if bosToken, ok := tokenizerConfig.(map[string]interface{})["bos_token"].(string); ok {
hfConfig.BosTokenStr = &bosToken
@@ -889,6 +912,46 @@ func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig
return hfConfig, nil
}

// resolveSpecialsAndSpecialTokens
// Resolve special tokens and special tokens config from resources.
// Used to be able to resolve both embedded and local resources.
// Continuation of ResolveHFFromResources.
func resolveSpecialsAndSpecialTokens(resources *Resources, hfConfig *HFConfig) (*HFConfig, error) {
// Get specials config from resources
// We can only generate specials.json if we have special_tokens_map
specialsJson, ok := (*resources)["special_tokens_map.json"]
if ok {
specialTokens := make(map[string]interface{}, 0)
if specialErr := json.Unmarshal(*specialsJson.Data,
&specialTokens); specialErr != nil {
return nil, specialErr
}

// Try to get pad token from specials if not already set
if hfConfig.PadTokenStr == nil {
if padToken, ok := specialTokens["pad_token"].(string); ok {
hfConfig.PadTokenStr = &padToken
}
}
}

// Otherwise, fall back to specials.txt
specialsTxt, ok := (*resources)["specials.txt"]
if ok {
// Treat specials.txt as an array of strings and try to match
specials := strings.Split(string(*specialsTxt.Data), "\n")
if hfConfig.PadTokenStr == nil {
for _, special := range specials {
if strings.Contains(strings.ToLower(special), "pad") {
hfConfig.PadTokenStr = &special
break
}
}
}
}
return hfConfig, nil
}

// ResolveVocabId
// Resolves a vocabulary id to a set of resources, from embedded,
// local filesystem, or remote.
@@ -902,7 +965,6 @@ func ResolveVocabId(vocabId string, token string) (*HFConfig, *Resources, error)
ModelId: &vocabId,
BosTokenStr: &bosText,
EosTokenStr: &endOfText,
PadTokenStr: &endOfText,
}
resources := make(Resources, 0)

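A minimal standalone sketch of the specials.txt fallback in resolveSpecialsAndSpecialTokens above, using hypothetical file contents; the heuristic simply picks the first special token whose lowercased form contains "pad":

package main

import (
    "fmt"
    "strings"
)

func main() {
    // Hypothetical specials.txt contents: one special token per line.
    data := "<|endoftext|>\n<|pad|>\n<|mask|>"

    // Same heuristic as resolveSpecialsAndSpecialTokens: take the first
    // special token whose lowercased form contains "pad".
    var padTokenStr *string
    for _, special := range strings.Split(data, "\n") {
        special := special // copy before taking its address
        if strings.Contains(strings.ToLower(special), "pad") {
            padTokenStr = &special
            break
        }
    }
    if padTokenStr != nil {
        fmt.Println(*padTokenStr) // prints: <|pad|>
    }
}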
71 changes: 39 additions & 32 deletions resources/resource_data_js.go

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions resources/resources.go
@@ -25,6 +25,7 @@ import (
//go:embed data/clip-tokenizer/unitrim.json
//go:embed data/clip-tokenizer/specials.txt
//go:embed data/clip-tokenizer/special_config.json
//go:embed data/clip-tokenizer/special_tokens_map.json
//go:embed data/nerdstash_v1-tokenizer/encoder.json
//go:embed data/nerdstash_v1-tokenizer/merges.json
//go:embed data/nerdstash_v1-tokenizer/specials.txt
