-
-
Notifications
You must be signed in to change notification settings - Fork 197
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/develop'
- Loading branch information
Showing
18 changed files
with
41,485 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
[submodule "_tools/pinyin-data"] | ||
path = _tools/pinyin-data | ||
url = https://github.com/mozillazg/pinyin-data.git | ||
[submodule "_tools/phrase-data"] | ||
path = _tools/phrase-data | ||
url = https://github.com/hotoo/pinyin.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,28 +1,22 @@ | ||
language: go | ||
go: | ||
- '1.7' | ||
- '1.8' | ||
- '1.9.x' | ||
- '1.10.x' | ||
- '1.11.x' | ||
- '1.12.x' | ||
- 'master' | ||
- "1.9.x" | ||
- "1.10.x" | ||
- "1.11.x" | ||
- "1.12.x" | ||
- "1.13.x" | ||
|
||
sudo: false | ||
|
||
before_install: | ||
- if ! go get code.google.com/p/go.tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi | ||
- go get github.com/mattn/go-isatty | ||
- go get github.com/axw/gocov/gocov | ||
- go get github.com/mattn/goveralls | ||
- go get ./cmd/pinyin | ||
|
||
script: | ||
- go run cmd/pinyin/main.go abc | ||
- go run cmd/pinyin/main.go -s zhao abc | ||
- echo "abc" | go run cmd/pinyin/main.go | ||
- echo "abc" > abc.txt && go run cmd/pinyin/main.go < abc.txt | ||
- $HOME/gopath/bin/goveralls -service=travis-ci -v -package . | ||
|
||
matrix: | ||
allow_failures: | ||
- go: master |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
package main | ||
|
||
import ( | ||
"bufio" | ||
"flag" | ||
"fmt" | ||
"io" | ||
"os" | ||
"strings" | ||
) | ||
|
||
// cmdArgs holds the two positional command-line arguments of the generator.
type cmdArgs struct { | ||
// inputFile is the first positional argument; main opens it for reading.
inputFile string | ||
// outputFile is the second positional argument; main creates it for writing.
outputFile string | ||
} | ||
|
||
// genCode reads a phrase dictionary in the phrases-dict.js layout from inFile
// and writes a Go source file declaring the phraseDict map to outFile.
//
// Only lines that start with a double quote (after trimming surrounding
// whitespace) are treated as dictionary entries; every other line is skipped.
// Each entry is rewritten from
//
//	"后来居上": [["hòu"], ["lái"], ["jū"], ["shàng"]],
//
// to
//
//	"后来居上": "hòu lái jū shàng",
//
// genCode panics on any read or write error.
func genCode(inFile *os.File, outFile *os.File) {
	rd := bufio.NewReader(inFile)
	output := `package pinyin
// phraseDict is data map
//
// Generate from:
// https://github.com/hotoo/pinyin/blob/master/data/phrases-dict.js
//
// Warning: Auto-generated file, don't edit.
// If you want add more words, use phrase_dict_addition.go
var phraseDict = map[string]string{
`
	lines := []string{}

	for {
		line, err := rd.ReadString('\n')
		if err != nil && err != io.EOF {
			panic(err)
		}
		// ReadString returns whatever it read before hitting EOF, so a final
		// line without a trailing newline must still be processed before the
		// loop stops.
		atEOF := err == io.EOF

		// Remove surrounding whitespace (indentation and the line terminator).
		line = strings.TrimSpace(line)

		if strings.HasPrefix(line, `"`) {
			// Drop the JS array brackets, then merge the quoted syllables
			// into one space-separated string.
			line = strings.ReplaceAll(line, `[`, "")
			line = strings.ReplaceAll(line, `]`, "")
			line = strings.ReplaceAll(line, `", "`, " ")
			lines = append(lines, line)
		}

		if atEOF {
			break
		}
	}

	output += strings.Join(lines, "\n")
	output += "\n}\n"
	if _, err := outFile.WriteString(output); err != nil {
		panic(err)
	}
}
|
||
func parseCmdArgs() cmdArgs { | ||
flag.Parse() | ||
inputFile := flag.Arg(0) | ||
outputFile := flag.Arg(1) | ||
return cmdArgs{inputFile, outputFile} | ||
} | ||
|
||
func main() { | ||
args := parseCmdArgs() | ||
usage := "gen_phrase_dict INPUT OUTPUT" | ||
inputFile := args.inputFile | ||
outputFile := args.outputFile | ||
if inputFile == "" || outputFile == "" { | ||
fmt.Println(usage) | ||
os.Exit(1) | ||
} | ||
|
||
inFp, err := os.Open(inputFile) | ||
if err != nil { | ||
fmt.Printf("open file %s error", inputFile) | ||
panic(err) | ||
} | ||
outFp, err := os.Create(outputFile) | ||
if err != nil { | ||
fmt.Printf("open file %s error", outputFile) | ||
panic(err) | ||
} | ||
defer inFp.Close() | ||
defer outFp.Close() | ||
|
||
genCode(inFp, outFp) | ||
} |
File renamed without changes.
Submodule phrase-data
added at
f2b37a
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,6 @@ | ||
module github.com/mozillazg/go-pinyin | ||
|
||
require ( | ||
github.com/mattn/go-isatty v0.0.10 | ||
github.com/yanyiwu/gojieba v1.1.0 | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
github.com/mattn/go-isatty v0.0.10 h1:qxFzApOv4WsAL965uUPIsXzAKCZxN2p9UqdhFS4ZW10= | ||
github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84= | ||
github.com/yanyiwu/gojieba v1.1.0 h1:rx+kNP0L7zn+4Zyo1d8aLCfXt3BmIkWQ26FCbGnw1bc= | ||
github.com/yanyiwu/gojieba v1.1.0/go.mod h1:0AAj9tOG6WWXQ5FNffl4ruBy/hP7bHl2gs+YiDi1aYs= | ||
golang.org/x/sys v0.0.0-20191008105621-543471e840be h1:QAcqgptGM8IQBC9K/RC4o+O9YmqEm0diQn9QmZw/0mU= | ||
golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package pinyin | ||
|
||
import ( | ||
"regexp" | ||
"strings" | ||
"unicode" | ||
) | ||
|
||
// Package-level values used by Paragraph.
var ( | ||
// splacesRegexp matches a run of one or more whitespace characters.
splacesRegexp = regexp.MustCompile(`[\s]+`) | ||
// allowCharsRegexp matches a single character from the set of ASCII letters,
// digits, the listed punctuation marks and whitespace.
allowCharsRegexp = regexp.MustCompile(`[a-zA-Z0-9\.,\?\!;\(\)\[\]\&\=\-_@\s]`) | ||
// hansSymbols maps a symbol to the text Paragraph writes in its place.
// NOTE(review): several keys look identical to their values here; presumably
// the keys are full-width forms — confirm against the repository.
hansSymbols = map[string]string{ | ||
"?": "?", | ||
"!": "!", | ||
":": ":", | ||
"。": ".", | ||
",": ",", | ||
";": ";", | ||
"(": "(", | ||
")": ")", | ||
"【": "[", | ||
"】": "]", | ||
} | ||
// paragraphOption is the Args value used for paragraph conversion:
// NORMAL style with Heteronym enabled.
paragraphOption = Args{ | ||
Style: NORMAL, | ||
Heteronym: true, | ||
} | ||
) | ||
|
||
// Paragraph convert a Chinese paragraph into pinyin, including letters, numbers, symbols | ||
func Paragraph(p string) (s string) { | ||
p = pinyinPhrase(p) | ||
|
||
for _, r := range p { | ||
if unicode.Is(unicode.Han, r) { | ||
// Han chars | ||
result := Pinyin(string(r), paragraphOption) | ||
if len(result) == 0 { | ||
continue | ||
} | ||
if len(result[0]) == 0 { | ||
continue | ||
} | ||
|
||
s += " " + string(result[0][0]) + " " | ||
} else { | ||
// Other chars | ||
char := string(r) | ||
|
||
if allowCharsRegexp.MatchString(char) { | ||
s += char | ||
} else { | ||
if hansSymbols[char] != "" { | ||
s += hansSymbols[char] | ||
} | ||
} | ||
} | ||
} | ||
|
||
// 去连续两个空格 | ||
s = splacesRegexp.ReplaceAllString(s, " ") | ||
// 去掉 , . ? 前面的空格 | ||
s = strings.Replace(s, " ,", ",", -1) | ||
s = strings.Replace(s, " .", ".", -1) | ||
s = strings.Replace(s, " ?", "?", -1) | ||
s = strings.Replace(s, " ;", ";", -1) | ||
s = strings.Replace(s, " !", "!", -1) | ||
s = strings.Replace(s, "( ", "(", -1) | ||
s = strings.Replace(s, " )", ")", -1) | ||
s = strings.Replace(s, "[ ", "[", -1) | ||
s = strings.Replace(s, " ]", "]", -1) | ||
s = strings.Replace(s, " :", ":", -1) | ||
s = strings.TrimSpace(s) | ||
return | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
package pinyin | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func BenchmarkParagraph(b *testing.B) { | ||
for i := 0; i < b.N; i++ { | ||
// about 0.06ms/op | ||
Paragraph("这条恶狗真可恶 满身臭味 让人闻了就恶心 让人厌恶 像恶魔让人做恶梦") | ||
} | ||
} | ||
|
||
func TestParagraph(t *testing.T) { | ||
expects := map[string]string{ | ||
"天府大道北段18号高新国际广场A-3号!": "tian fu da dao bei duan 18 hao gao xin guo ji guang chang A-3 hao!", | ||
"人民银行旁边一行人abc字母路牌,平行宇宙发行股票": "ren min yin xing pang bian yi xing ren abc zi mu lu pai, ping xing yu zhou fa xing gu piao", | ||
"我的大王!": "wo de dai wang!", | ||
"A abc&1234个人. 银行旁边,一行人?": "A abc&1234 ge ren. yin xing pang bian, yi xing ren?", | ||
"A abc&1234个人,中国(银行)旁边;一“行”人?": "A abc&1234 ge ren, zhong guo (yin xing) pang bian; yi xing ren?", | ||
"字符串长桥: aBc多音字&123": "zi fu chuan chang qiao aBc duo yin zi &123", | ||
"【腾讯(00700)拟2.305亿元出售应收账款 赚1720万元】": "[teng xun (00700) ni 2.305 yi yuan chu shou ying shou zhang kuan zhuan 1720 wan yuan]", | ||
"地址:重庆市江北区重工业?": "di zhi: chong qing shi jiang bei qu zhong gong ye?", | ||
"交给团长,告诉他我们给予期望。前线的供给一定要能自给自足!": "jiao gei tuan zhang, gao su ta wo men ji yu qi wang. qian xian de gong ji yi ding yao neng zi ji zi zu!", | ||
"abc123": "abc123", | ||
"義灬骉驫芔淼㴇": "yi biao biao biao hui miao she", | ||
"": "", | ||
} | ||
|
||
for source, expect := range expects { | ||
actual := Paragraph(source) | ||
if expect != actual { | ||
t.Errorf("\nexpect: %s\nactual: %s\n", expect, actual) | ||
break | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
package pinyin | ||
|
||
import ( | ||
"strings" | ||
|
||
"github.com/yanyiwu/gojieba" | ||
) | ||
|
||
var ( | ||
// jieba is the package-wide segmenter, created with gojieba.NewJieba() when
// the package is initialized and used by cutWords.
jieba = gojieba.NewJieba() | ||
) | ||
|
||
// cutWords returns the words produced by jieba.CutAll for s.
func cutWords(s string) []string { | ||
return jieba.CutAll(s) | ||
} | ||
|
||
func pinyinPhrase(s string) string { | ||
words := cutWords(s) | ||
for _, word := range words { | ||
match := phraseDict[word] | ||
if match == "" { | ||
match = phraseDictAddition[word] | ||
} | ||
|
||
match = toFixed(match, paragraphOption) | ||
if match != "" { | ||
s = strings.Replace(s, word, " "+match+" ", 1) | ||
} | ||
} | ||
|
||
return s | ||
} | ||
|
Oops, something went wrong.