-
Notifications
You must be signed in to change notification settings - Fork 39
/
token.go
79 lines (67 loc) · 2.18 KB
/
token.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package sentences
import (
"fmt"
"regexp"
)
// TokenGrouper groups adjacent tokens together into two-element pairs.
type TokenGrouper interface {
	// Group receives a slice of token pointers and returns a slice of
	// two-element token arrays built from it.
	Group(tokens []*Token) [][2]*Token
}
// DefaultTokenGrouper is the default implementation of TokenGrouper.
// It has no fields; all of its behavior lives in its Group method.
type DefaultTokenGrouper struct{}
// Group walks tokens in order and pairs each token with the one that follows
// it. The last token is paired with nil, so a non-empty input always yields at
// least one pair. An empty (or nil) input yields nil.
//
// A token that is the very same pointer as the token before it does not start
// a new pair; it is skipped.
func (p *DefaultTokenGrouper) Group(tokens []*Token) [][2]*Token {
	if len(tokens) == 0 {
		return nil
	}

	pairs := make([][2]*Token, 0, len(tokens))
	left := tokens[0]
	for i := 1; i < len(tokens); i++ {
		right := tokens[i]
		if right == left {
			// Identical pointer to the current left-hand token: emit nothing.
			continue
		}
		pairs = append(pairs, [2]*Token{left, right})
		left = right
	}

	// The trailing token has no successor, so its partner is nil.
	return append(pairs, [2]*Token{left, nil})
}
// Token stores a token of text with annotations produced during sentence
// boundary detection.
type Token struct {
	// Tok is the token text; NewToken stores its argument here unchanged.
	Tok string

	// Position is an integer location associated with the token.
	// NOTE(review): the unit (byte, rune, or token index) is not visible in
	// this file — confirm against the code that sets it.
	Position int

	// Boolean annotations. NewToken leaves all of them at false.
	SentBreak, ParaStart, LineStart, Abbr bool

	// periodFinal is an unexported flag that NewToken also leaves at false.
	periodFinal bool

	// Compiled patterns carried on each token. NewToken points them at the
	// package-level variables of the same names.
	reEllipsis, reNumeric, reInitial, reListNumber, reAlpha, reCoordinateSecondPart *regexp.Regexp
}
// Patterns compiled once at package scope; NewToken hands these same compiled
// values to every Token it creates.
var (
	// reEllipsis matches two or more periods at the end of the text.
	reEllipsis = regexp.MustCompile(`\.\.+$`)

	// reNumeric matches text that ends in a number: an optional minus sign, an
	// optional leading period or comma, a digit, any mix of digits, commas,
	// periods and hyphens, then an optional final period. It is anchored at
	// the end only, so a numeric suffix is enough to match.
	reNumeric = regexp.MustCompile(`-?[\.,]?\d[\d,\.-]*\.?$`)

	// reInitial matches exactly one ASCII letter followed by a period.
	reInitial = regexp.MustCompile(`^[A-Za-z]\.$`)

	// reListNumber matches one or more digits, then optionally any single
	// character, then an optional closing parenthesis.
	// NOTE(review): the `.` is unescaped, so it matches any character rather
	// than only a literal period — confirm whether that is intended.
	reListNumber = regexp.MustCompile(`^\d+.?\)?$`)

	// reAlpha matches text made up solely of one or more ASCII letters.
	reAlpha = regexp.MustCompile(`^[A-Za-z]+$`)

	// reCoordinateSecondPart matches three (possibly empty) digit runs, each
	// followed by a period, spanning the whole text.
	reCoordinateSecondPart = regexp.MustCompile(`^[0-9]*\.[0-9]*\.[0-9]*\.$`)
)
// NewToken allocates a Token whose Tok field holds the given text and whose
// regex fields point at the package-level compiled patterns. Position and all
// boolean annotations are left at their zero values.
func NewToken(token string) *Token {
	tok := new(Token)
	tok.Tok = token

	// Every token shares the same compiled patterns; nothing is recompiled here.
	tok.reEllipsis = reEllipsis
	tok.reNumeric = reNumeric
	tok.reInitial = reInitial
	tok.reListNumber = reListNumber
	tok.reAlpha = reAlpha
	tok.reCoordinateSecondPart = reCoordinateSecondPart

	return tok
}
// String renders the token for debugging output: the quoted token text, the
// SentBreak and Abbr flags, and the Position, in that order.
func (p *Token) String() string {
	const layout = "<Token Tok: %q, SentBreak: %t, Abbr: %t, Position: %d>"
	return fmt.Sprintf(layout, p.Tok, p.SentBreak, p.Abbr, p.Position)
}