word_tokenizer.go

package sentences

import (
	"regexp"
	"strings"
	"sync"
	"unicode"
	"unicode/utf8"
)

// WordTokenizer is the primary interface for tokenizing words. It combines the
// TokenParser helpers with a Tokenize method that splits text into tokens.
type WordTokenizer interface {
	TokenParser
	// Tokenize splits the given text into tokens. In DefaultWordTokenizer the
	// bool argument restricts the output to words that carry sentence
	// punctuation plus the word that follows each of them.
	Tokenize(string, bool) []*Token
}

// TokenType groups the helpers that return the normalized "type" string of a token.
type TokenType interface {
	// The case-normalized representation of the token.
	Type(*Token) string
	// The type with its final period removed if it has one.
	TypeNoPeriod(*Token) string
	// The type with its final period removed if it is marked as a sentence break.
	TypeNoSentPeriod(*Token) string
}

// TokenFirst groups the helpers that report the case of a token's first character.
type TokenFirst interface {
	// True if the token's first character is lowercase.
	FirstLower(*Token) bool
	// True if the token's first character is uppercase.
	FirstUpper(*Token) bool
}

// TokenExistential groups the helpers that classify what kind of token we are dealing with.
type TokenExistential interface {
	// True if the token text is all alphabetic.
	IsAlpha(*Token) bool
	// True if the token text is that of an ellipsis.
	IsEllipsis(*Token) bool
	// True if the token text is that of an initial.
	IsInitial(*Token) bool
	// True if the token text is that of a number as part of a list.
	IsListNumber(*Token) bool
	// True if the token text is that of a number.
	IsNumber(*Token) bool
	// True if the token is either a number or is alphabetic.
	IsNonPunct(*Token) bool
	// True if the token is the first part of a coordinate.
	IsCoordinatePartOne(*Token) bool
	// True if the token is the second part of a coordinate.
	IsCoordinatePartTwo(*Token) bool
	// Does this token end with a period?
	HasPeriodFinal(*Token) bool
	// Does this token end with a punctuation and a quote?
	HasSentEndChars(*Token) bool
	// Does this token end with ambiguous punctuation?
	HasUnreliableEndChars(*Token) bool
}

// TokenParser is the primary token interface that determines the context and type of a tokenized word.
// It is the union of the TokenType, TokenFirst and TokenExistential helper sets.
type TokenParser interface {
	TokenType
	TokenFirst
	TokenExistential
}

// DefaultWordTokenizer is the default implementation of the WordTokenizer.
// It embeds PunctStrings, whose HasSentencePunct and NonPunct methods supply
// the punctuation rules used by Tokenize and IsNonPunct.
type DefaultWordTokenizer struct {
	PunctStrings
}

// NewWordTokenizer returns a DefaultWordTokenizer that uses p as its embedded
// punctuation rule set.
func NewWordTokenizer(p PunctStrings) *DefaultWordTokenizer {
	tokenizer := &DefaultWordTokenizer{PunctStrings: p}
	return tokenizer
}

// Tokenize breaks text into words while preserving their character position, whether it starts
// a new line, and new paragraph.
//
// A word ends at a whitespace rune, at a CJK sentence punctuation mark (which
// is kept as the last character of the word), or at the final rune of text.
// Each returned Token carries:
//   - Position: the byte offset in text at which the word was cut;
//   - LineStart / ParaStart: set from the newlines seen since the previously
//     emitted token, including a newline that terminates the word itself
//     (one newline sets LineStart, a further one sets ParaStart).
//
// When onlyPeriodContext is true, only words for which HasSentencePunct
// reports true are emitted, together with the single word that follows each
// of them.
//
// An empty text yields nil. If no word was emitted for a non-empty text, a
// single token holding the whole text is returned.
func (p *DefaultWordTokenizer) Tokenize(text string, onlyPeriodContext bool) []*Token {
	textLength := len(text)

	if textLength == 0 {
		return nil
	}

	tokens := make([]*Token, 0, 50)
	lastSpace := 0
	lineStart := false
	paragraphStart := false
	getNextWord := false

	for next := 0; next < textLength; {
		// Decode explicitly so the byte width of every rune is known; all
		// boundary decisions below are made on byte offsets.
		start := next
		char, size := utf8.DecodeRuneInString(text[start:])
		next = start + size

		// Compare the END of the rune with the text length. Testing the start
		// offset against textLength-1 misses a multi-byte final rune and would
		// drop the last word.
		isLast := next == textLength
		isCjk := IsCjkPunct(char)

		if !unicode.IsSpace(char) && !isCjk && !isLast {
			continue
		}

		if char == '\n' {
			if lineStart {
				paragraphStart = true
			}
			lineStart = true
		}

		// Whitespace is excluded from the word; a CJK mark or the final rune
		// belongs to it, so the cut goes after that rune.
		cursor := start
		if isCjk || isLast {
			cursor = next
		}

		word := strings.TrimSpace(text[lastSpace:cursor])

		if word == "" {
			continue
		}

		hasSentencePunct := p.PunctStrings.HasSentencePunct(word)
		if onlyPeriodContext && !hasSentencePunct && !getNextWord {
			// Not interesting in period-context mode: skip it but keep moving
			// the start of the next word forward.
			lastSpace = cursor
			continue
		}

		token := NewToken(word)
		token.Position = cursor
		token.ParaStart = paragraphStart
		token.LineStart = lineStart
		tokens = append(tokens, token)

		lastSpace = cursor
		lineStart = false
		paragraphStart = false

		// The word right after a punctuated word is always emitted.
		getNextWord = hasSentencePunct
	}

	if len(tokens) == 0 {
		token := NewToken(text)
		token.Position = textLength
		tokens = append(tokens, token)
	}

	return tokens
}

// Type returns a case-normalized representation of the token: the text is
// lower-cased, every match of the token's numeric pattern is replaced by the
// "##number##" placeholder, and commas are stripped. A result that is exactly
// one byte long is returned as is, so a lone "," survives.
func (p *DefaultWordTokenizer) Type(t *Token) string {
	lowered := strings.ToLower(t.Tok)
	normalized := t.reNumeric.ReplaceAllString(lowered, "##number##")

	if len(normalized) != 1 {
		// removing comma from the normalized type
		normalized = strings.Replace(normalized, ",", "", -1)
	}
	return normalized
}

// TypeNoPeriod returns Type(t) with one trailing "." removed. Types of at most
// one byte are returned unchanged, so a lone "." is kept.
func (p *DefaultWordTokenizer) TypeNoPeriod(t *Token) string {
	typ := p.Type(t)

	last := len(typ) - 1
	if last >= 1 && typ[last] == '.' {
		return typ[:last]
	}
	return typ
}

// TypeNoSentPeriod returns the token's type, dropping its final period only
// when the token is marked as a sentence break. A nil receiver yields "".
func (p *DefaultWordTokenizer) TypeNoSentPeriod(t *Token) string {
	switch {
	case p == nil:
		return ""
	case t.SentBreak:
		return p.TypeNoPeriod(t)
	default:
		return p.Type(t)
	}
}

// FirstUpper is true if the token's first character is uppercase.
// An empty token yields false.
func (p *DefaultWordTokenizer) FirstUpper(t *Token) bool {
	if t.Tok == "" {
		return false
	}

	// Decode only the leading rune instead of converting the whole token to
	// a []rune, which would allocate on every call.
	first, _ := utf8.DecodeRuneInString(t.Tok)
	return unicode.IsUpper(first)
}

// FirstLower is true if the token's first character is lowercase.
// An empty token yields false.
func (p *DefaultWordTokenizer) FirstLower(t *Token) bool {
	if t.Tok == "" {
		return false
	}

	// Decode only the leading rune instead of converting the whole token to
	// a []rune, which would allocate on every call.
	first, _ := utf8.DecodeRuneInString(t.Tok)
	return unicode.IsLower(first)
}

// IsEllipsis reports whether the token text matches the token's ellipsis
// pattern.
func (p *DefaultWordTokenizer) IsEllipsis(t *Token) bool {
	matched := t.reEllipsis.MatchString(t.Tok)
	return matched
}

// IsNumber reports whether the token text starts with the "##number##"
// placeholder.
//
// NOTE(review): this inspects the raw token text, not Type(t), which is where
// the placeholder is substituted — confirm that is intended.
func (p *DefaultWordTokenizer) IsNumber(t *Token) bool {
	const numberPrefix = "##number##"
	return strings.HasPrefix(t.Tok, numberPrefix)
}

// IsInitial reports whether the token text matches the token's initial
// pattern.
func (p *DefaultWordTokenizer) IsInitial(t *Token) bool {
	matched := t.reInitial.MatchString(t.Tok)
	return matched
}

// IsListNumber reports whether the token text matches the token's list-number
// pattern.
func (p *DefaultWordTokenizer) IsListNumber(t *Token) bool {
	matched := t.reListNumber.MatchString(t.Tok)
	return matched
}

// IsAlpha reports whether the token text matches the token's alphabetic
// pattern.
func (p *DefaultWordTokenizer) IsAlpha(t *Token) bool {
	matched := t.reAlpha.MatchString(t.Tok)
	return matched
}

// IsCoordinatePartOne is true if the token text might be the first part of a
// coordinate, i.e. it is exactly "N°.".
func (p *DefaultWordTokenizer) IsCoordinatePartOne(t *Token) bool {
	// Plain equality; the strings package recommends == over strings.Compare.
	return t.Tok == "N°."
}

// IsCoordinatePartTwo reports whether the token text matches the token's
// pattern for the second part of a coordinate.
func (p *DefaultWordTokenizer) IsCoordinatePartTwo(t *Token) bool {
	matched := t.reCoordinateSecondPart.MatchString(t.Tok)
	return matched
}

// nonPunctRegexpCache memoizes compiled NonPunct patterns, keyed by pattern
// source (string -> *regexp.Regexp), so each distinct pattern is compiled
// once rather than on every IsNonPunct call. Entries are never evicted.
var nonPunctRegexpCache sync.Map

// IsNonPunct is true if the token is either a number or is alphabetic, i.e.
// if the token's Type matches the pattern returned by PunctStrings.NonPunct.
//
// It panics if that pattern is not a valid regular expression.
func (p *DefaultWordTokenizer) IsNonPunct(t *Token) bool {
	pattern := p.PunctStrings.NonPunct()

	compiled, ok := nonPunctRegexpCache.Load(pattern)
	if !ok {
		// LoadOrStore keeps a single winner if two goroutines race here.
		compiled, _ = nonPunctRegexpCache.LoadOrStore(pattern, regexp.MustCompile(pattern))
	}

	return compiled.(*regexp.Regexp).MatchString(p.Type(t))
}

// HasPeriodFinal is true if the token ends with a period, either the ASCII
// "." or the ideographic full stop "。".
func (p *DefaultWordTokenizer) HasPeriodFinal(t *Token) bool {
	for _, period := range [...]string{".", "。"} {
		if strings.HasSuffix(t.Tok, period) {
			return true
		}
	}
	return false
}

// HasSentEndChars finds any sentence-ending punctuation other than a plain
// final period. It is true when the token either
//   - ends with one of the trailing sequences below (a question or
//     exclamation mark, or a terminator followed by a closing quote or
//     parenthesis), or
//   - contains, anywhere, a terminator immediately followed by an opening
//     bracket or a quote.
//
// Both ASCII and full-width (CJK) forms are recognized.
func (p *DefaultWordTokenizer) HasSentEndChars(t *Token) bool {
	tok := t.Tok

	// Sequences that only count at the very end of the token.
	trailing := [...]string{
		`."`, `.'`, `.)`,
		`?`, `?"`, `?'`, `?)`,
		`!`, `!"`, `!'`, `!)`, `!’`, `!”`,
		`。”`, `。’`, `。）`,
		`？`, `？”`, `？’`, `？）`,
		`！`, `！”`, `！’`, `！）`,
	}
	for i := range trailing {
		if strings.HasSuffix(tok, trailing[i]) {
			return true
		}
	}

	// Sequences that count wherever they appear inside the token.
	embedded := [...]string{
		`.[`, `.(`, `."`, `.'`,
		`?[`, `?(`,
		`![`, `!(`,
		`。【`, `。（`, `。”`, `。’`,
		`？【`, `？（`,
		`！【`, `！（`,
	}
	for i := range embedded {
		if strings.Contains(tok, embedded[i]) {
			return true
		}
	}

	return false
}

// HasUnreliableEndChars finds punctuation that might mean the end of a
// sentence but doesn't have to: it is true when the token ends with ".", "?"
// or "!" immediately followed by a closing quote (straight or curly) or ")".
func (p *DefaultWordTokenizer) HasUnreliableEndChars(t *Token) bool {
	unreliable := [...]string{
		`."`, `.'`, `.)`, `.’`, `.”`,
		`?"`, `?'`, `?)`, `?’`, `?”`,
		`!"`, `!'`, `!)`, `!’`, `!”`,
	}

	for i := range unreliable {
		if strings.HasSuffix(t.Tok, unreliable[i]) {
			return true
		}
	}
	return false
}
// IsCjkPunct reports whether r is one of the full-width CJK punctuation marks
// '。', '；', '！' or '？'.
func IsCjkPunct(r rune) bool {
	return r == '。' || r == '；' || r == '！' || r == '？'
}