Skip to content

Commit

Permalink
Create PreTokenizedString.IntoEncoding
Browse files Browse the repository at this point in the history
  • Loading branch information
marco-nicola committed Dec 12, 2020
1 parent 265892f commit e5c78e8
Showing 1 changed file with 66 additions and 0 deletions.
66 changes: 66 additions & 0 deletions pretokenizedstring/pretokenizedstring.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
package pretokenizedstring

import (
"fmt"
"github.com/nlpodyssey/gotokenizers/encodings"
"github.com/nlpodyssey/gotokenizers/models"
"github.com/nlpodyssey/gotokenizers/normalizedstring"
"github.com/nlpodyssey/gotokenizers/strutils"
Expand Down Expand Up @@ -156,3 +158,67 @@ func (p *PreTokenizedString) GetNormalizedByteSplits() []NormalizedByteSplit {
// Splits returns the current list of splits of this PreTokenizedString.
func (p *PreTokenizedString) Splits() []Split {
	return p.splits
}

// IntoEncoding transforms the current PreTokenizedString into an
// encodings.Encoding.
//
// If a wordIndex is provided (i.e. >= 0), any word in the generated Encoding
// will be set to this value. This is generally used with pre-tokenized
// input, that does not need the PreTokenizedString to generate word ids.
//
// This method will fail if some splits do not have associated Token.
//
// Offset indices are based on bytes (not runes).
func (p *PreTokenizedString) IntoEncoding(wordIndex int, typeID int) (*encodings.Encoding, error) {
	if len(p.splits) == 0 {
		return encodings.NewDefaultEncoding(), nil
	}
	if !p.allSplitsHaveTokens() {
		return nil, fmt.Errorf("splits have not been tokenized, call `PreTokenizedString.Tokenize` first")
	}

	// The total number of tokens is known up front (and every split's
	// Tokens is non-nil after the check above), so pre-size the sequence
	// to avoid repeated slice growth while appending.
	tokensCount := 0
	for _, split := range p.splits {
		tokensCount += len(*split.Tokens)
	}
	sequence := make([]encodings.EncodableToken, 0, tokensCount)

	for splitIndex, split := range p.splits {
		nsOffsets := split.NormalizedString.OriginalOffsets()

		// Without an explicit word index, each split is its own word.
		actualWordIndex := wordIndex
		if actualWordIndex < 0 {
			actualWordIndex = splitIndex
		}

		for _, token := range *split.Tokens {
			var offsets strutils.ByteOffsets

			// Map the token's offsets (relative to the normalized string)
			// back onto the original string; when the mapping fails, fall
			// back to the token's own offsets unchanged.
			tokenOrigRange, ok := split.NormalizedString.CoerceRangeToOriginal(
				normalizedstring.NewNormalizedRange(token.Offsets.Start, token.Offsets.End))
			if ok {
				offsets = strutils.ByteOffsets{
					Start: nsOffsets.Start + tokenOrigRange.Start(),
					End:   nsOffsets.Start + tokenOrigRange.End(),
				}
			} else {
				offsets = token.Offsets
			}

			sequence = append(sequence, encodings.EncodableToken{
				ID:        token.ID,
				Token:     token.Value,
				Offsets:   offsets,
				WordIndex: actualWordIndex,
				TypeID:    typeID,
			})
		}
	}

	return encodings.EncodingFromEncodableTokens(sequence), nil
}

// allSplitsHaveTokens reports whether every split already has an
// associated (non-nil) Tokens value, i.e. whether tokenization has
// been performed on all splits.
func (p *PreTokenizedString) allSplitsHaveTokens() bool {
	for i := range p.splits {
		if p.splits[i].Tokens == nil {
			return false
		}
	}
	return true
}

0 comments on commit e5c78e8

Please sign in to comment.