Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
mozillazg committed Dec 5, 2019
2 parents a1cd0b7 + fea3a56 commit c9122b8
Show file tree
Hide file tree
Showing 18 changed files with 41,485 additions and 16 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "_tools/pinyin-data"]
path = _tools/pinyin-data
url = https://github.com/mozillazg/pinyin-data.git
[submodule "_tools/phrase-data"]
path = _tools/phrase-data
url = https://github.com/hotoo/pinyin.git
18 changes: 6 additions & 12 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,28 +1,22 @@
language: go
go:
- '1.7'
- '1.8'
- '1.9.x'
- '1.10.x'
- '1.11.x'
- '1.12.x'
- 'master'
- "1.9.x"
- "1.10.x"
- "1.11.x"
- "1.12.x"
- "1.13.x"

sudo: false

before_install:
- if ! go get code.google.com/p/go.tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi
- go get github.com/mattn/go-isatty
- go get github.com/axw/gocov/gocov
- go get github.com/mattn/goveralls
- go get ./cmd/pinyin

script:
- go run cmd/pinyin/main.go abc
- go run cmd/pinyin/main.go -s zhao abc
- echo "abc" | go run cmd/pinyin/main.go
- echo "abc" > abc.txt && go run cmd/pinyin/main.go < abc.txt
- $HOME/gopath/bin/goveralls -service=travis-ci -v -package .

matrix:
allow_failures:
- go: master
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## [0.16.0] (2019-12-05)

* **NEW** 增加 ``func Paragraph(p string) string`` 用于便捷处理大段文字
(thanks [@huacnlee] via [#37][#37])

## [0.15.0] (2019-04-06)

* **Changed** 使用 [pinyin-data][pinyin-data] v0.7.0 的拼音数据
Expand Down Expand Up @@ -193,9 +198,11 @@

[pinyin-data]: https://github.com/mozillazg/pinyin-data
[@wdscxsj]: https://github.com/wdscxsj
[@huacnlee]: https://github.com/huacnlee
[#19]: https://github.com/mozillazg/go-pinyin/pull/19
[#20]: https://github.com/mozillazg/go-pinyin/pull/20
[#30]: https://github.com/mozillazg/go-pinyin/pull/30
[#37]: https://github.com/mozillazg/go-pinyin/pull/37

[0.1.1]: https://github.com/mozillazg/go-pinyin/compare/v0.1.0...v0.1.1
[0.2.0]: https://github.com/mozillazg/go-pinyin/compare/v0.1.1...v0.2.0
Expand All @@ -213,3 +220,4 @@
[0.13.0]: https://github.com/mozillazg/go-pinyin/compare/v0.12.0...v0.13.0
[0.14.0]: https://github.com/mozillazg/go-pinyin/compare/v0.13.0...v0.14.0
[0.15.0]: https://github.com/mozillazg/go-pinyin/compare/v0.14.0...v0.15.0
[0.16.0]: https://github.com/mozillazg/go-pinyin/compare/v0.15.0...v0.16.0
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@ test:

.PHONY: gen_pinyin_dict
gen_pinyin_dict:
@go run _tools/gen_pinyin_dict.go _tools/pinyin-data/pinyin.txt pinyin_dict.go
@go run _tools/gen_pinyin_dict/main.go _tools/pinyin-data/pinyin.txt pinyin_dict.go

.PHONY: gen_phrase_dict
gen_phrase_dict:
@go run _tools/gen_phrase_dict/main.go _tools/phrase-data/data/phrases-dict.js phrase_dict.go
@goreturns -w phrase_dict.go

.PHONY: lint
lint:
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ func main() {

fmt.Println(pinyin.LazyConvert(hans, nil))
// [zhong guo ren]

// 段落转换,完整支持多音字,保留符号
fmt.Println(pinyin.Paragraph("交给团长,告诉他我们给予期望。前线的供给一定要能自给自足!"))
// jiao gei tuan zhang, gao su ta wo men ji yu qi wang. qian xian de gong ji yi ding yao neng zi ji zi zu!
}
```

Expand Down
92 changes: 92 additions & 0 deletions _tools/gen_phrase_dict/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package main

import (
"bufio"
"flag"
"fmt"
"io"
"os"
"strings"
)

// cmdArgs holds the two positional command-line arguments of the generator:
// the path of the phrase data to read and the path of the Go file to write.
type cmdArgs struct {
	inputFile, outputFile string
}

// genCode reads phrase entries from inFile line by line and writes a Go source
// file declaring the phraseDict map to outFile.
//
// Only lines that start with a double quote (after trimming surrounding
// whitespace) are treated as entries; every other line is skipped. For each
// entry the square brackets are stripped and the `", "` separators between
// syllables are replaced by a single space, e.g.
//
//	"后来居上": [["hòu"], ["lái"], ["jū"], ["shàng"]],
//
// becomes
//
//	"后来居上": "hòu lái jū shàng",
//
// genCode panics if reading inFile or writing outFile fails.
func genCode(inFile *os.File, outFile *os.File) {
	rd := bufio.NewReader(inFile)
	output := `package pinyin
// phraseDict is data map
//
// Generate from:
// https://github.com/hotoo/pinyin/blob/master/data/phrases-dict.js
//
// Warning: Auto-generated file, don't edit.
// If you want add more words, use phrase_dict_addition.go
var phraseDict = map[string]string{
`
	var lines []string

	for {
		line, err := rd.ReadString('\n')
		if err != nil && err != io.EOF {
			panic(err)
		}

		// ReadString returns whatever it read before hitting EOF, so the line
		// must be processed before the EOF check; otherwise a final entry
		// without a trailing newline would be dropped.
		line = strings.TrimSpace(line)
		if strings.HasPrefix(line, `"`) {
			line = strings.ReplaceAll(line, `[`, "")
			line = strings.ReplaceAll(line, `]`, "")
			line = strings.ReplaceAll(line, `", "`, " ")
			lines = append(lines, line)
		}

		if err == io.EOF {
			break
		}
	}

	output += strings.Join(lines, "\n")
	output += "\n}\n"
	if _, err := outFile.WriteString(output); err != nil {
		panic(err)
	}
}

// parseCmdArgs parses the command line and returns the first two positional
// arguments as the input and output file paths. A missing argument is
// returned as an empty string.
func parseCmdArgs() cmdArgs {
	flag.Parse()
	return cmdArgs{
		inputFile:  flag.Arg(0),
		outputFile: flag.Arg(1),
	}
}

// main is the entry point of the phrase dictionary generator.
//
// It expects two positional arguments, INPUT and OUTPUT. If either is missing
// it prints the usage line and exits with status 1. Otherwise it opens INPUT,
// creates (or truncates) OUTPUT and hands both to genCode. Failure to open
// either file is reported and then panics.
func main() {
	const usage = "gen_phrase_dict INPUT OUTPUT"

	args := parseCmdArgs()
	if args.inputFile == "" || args.outputFile == "" {
		fmt.Println(usage)
		os.Exit(1)
	}

	inFp, err := os.Open(args.inputFile)
	if err != nil {
		fmt.Printf("open file %s error\n", args.inputFile)
		panic(err)
	}
	// Register the close right away so the input handle is released even if
	// creating the output file panics below.
	defer inFp.Close()

	outFp, err := os.Create(args.outputFile)
	if err != nil {
		fmt.Printf("open file %s error\n", args.outputFile)
		panic(err)
	}
	defer outFp.Close()

	genCode(inFp, outFp)
}
File renamed without changes.
1 change: 1 addition & 0 deletions _tools/phrase-data
Submodule phrase-data added at f2b37a
6 changes: 6 additions & 0 deletions example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,9 @@ func ExampleSlug() {
fmt.Println(pinyin.Slug(hans, a))
// Output: zhong-guo-ren
}

// ExampleParagraph shows Paragraph converting a sentence that mixes Han
// characters, ASCII letters and punctuation.
func ExampleParagraph() {
	fmt.Println(pinyin.Paragraph("人民银行旁边一行人abc字母【路牌】,平行宇宙发行股票。"))
	// Output: ren min yin xing pang bian yi xing ren abc zi mu [lu pai], ping xing yu zhou fa xing gu piao.
}
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
module github.com/mozillazg/go-pinyin

require (
github.com/mattn/go-isatty v0.0.10
github.com/yanyiwu/gojieba v1.1.0
)
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
github.com/mattn/go-isatty v0.0.10 h1:qxFzApOv4WsAL965uUPIsXzAKCZxN2p9UqdhFS4ZW10=
github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84=
github.com/yanyiwu/gojieba v1.1.0 h1:rx+kNP0L7zn+4Zyo1d8aLCfXt3BmIkWQ26FCbGnw1bc=
github.com/yanyiwu/gojieba v1.1.0/go.mod h1:0AAj9tOG6WWXQ5FNffl4ruBy/hP7bHl2gs+YiDi1aYs=
golang.org/x/sys v0.0.0-20191008105621-543471e840be h1:QAcqgptGM8IQBC9K/RC4o+O9YmqEm0diQn9QmZw/0mU=
golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
75 changes: 75 additions & 0 deletions paragraph.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package pinyin

import (
"regexp"
"strings"
"unicode"
)

// splacesRegexp matches a run of one or more whitespace characters.
var splacesRegexp = regexp.MustCompile(`[\s]+`)

// allowCharsRegexp matches a single character that Paragraph copies to its
// output unchanged: ASCII letters, digits, whitespace and a fixed set of
// punctuation marks.
var allowCharsRegexp = regexp.MustCompile(`[a-zA-Z0-9\.,\?\!;\(\)\[\]\&\=\-_@\s]`)

// hansSymbols maps a symbol to the text Paragraph emits in its place.
// Paragraph consults it only for characters that allowCharsRegexp rejects.
var hansSymbols = map[string]string{
	"?": "?",
	"!": "!",
	":": ":",
	"。": ".",
	",": ",",
	";": ";",
	"(": "(",
	")": ")",
	"【": "[",
	"】": "]",
}

// paragraphOption is the conversion configuration used for paragraph
// processing: NORMAL style with heteronym lookup enabled.
var paragraphOption = Args{
	Style:     NORMAL,
	Heteronym: true,
}

// Paragraph converts a Chinese paragraph into pinyin while keeping letters,
// numbers and supported symbols.
//
// The input first goes through pinyinPhrase. The result is then walked rune by
// rune: a Han character is replaced by the first reading Pinyin returns for it
// (padded with spaces), a character accepted by allowCharsRegexp is copied
// as-is, a character listed in hansSymbols is replaced by its mapped value,
// and anything else is dropped. Finally whitespace is normalised.
func Paragraph(p string) (s string) {
	for _, r := range pinyinPhrase(p) {
		if !unicode.Is(unicode.Han, r) {
			// Non-Han: pass through allowed characters, translate known
			// symbols, silently drop the rest.
			char := string(r)
			if allowCharsRegexp.MatchString(char) {
				s += char
			} else if hansSymbols[char] != "" {
				s += hansSymbols[char]
			}
			continue
		}

		// Han character: skip it when no reading is available.
		result := Pinyin(string(r), paragraphOption)
		if len(result) == 0 || len(result[0]) == 0 {
			continue
		}
		s += " " + string(result[0][0]) + " "
	}

	// Collapse every run of whitespace into a single space.
	s = splacesRegexp.ReplaceAllString(s, " ")

	// Remove the space before closing punctuation and after opening brackets.
	// The pairs are applied one after another, in this order.
	spacingFixes := [][2]string{
		{" ,", ","},
		{" .", "."},
		{" ?", "?"},
		{" ;", ";"},
		{" !", "!"},
		{"( ", "("},
		{" )", ")"},
		{"[ ", "["},
		{" ]", "]"},
		{" :", ":"},
	}
	for _, fix := range spacingFixes {
		s = strings.Replace(s, fix[0], fix[1], -1)
	}

	return strings.TrimSpace(s)
}
38 changes: 38 additions & 0 deletions paragraph_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package pinyin

import (
"testing"
)

// BenchmarkParagraph measures Paragraph on a short sentence that repeats the
// same character several times.
func BenchmarkParagraph(b *testing.B) {
	const text = "这条恶狗真可恶 满身臭味 让人闻了就恶心 让人厌恶 像恶魔让人做恶梦"

	// about 0.06ms/op
	for iter := 0; iter < b.N; iter++ {
		Paragraph(text)
	}
}

// TestParagraph checks Paragraph against a fixed list of inputs and their
// expected pinyin output.
//
// The cases live in a slice so they run in a stable order, and every mismatch
// is reported (together with its source text) instead of stopping at the
// first one.
func TestParagraph(t *testing.T) {
	cases := []struct {
		source string
		expect string
	}{
		{"天府大道北段18号高新国际广场A-3号!", "tian fu da dao bei duan 18 hao gao xin guo ji guang chang A-3 hao!"},
		{"人民银行旁边一行人abc字母路牌,平行宇宙发行股票", "ren min yin xing pang bian yi xing ren abc zi mu lu pai, ping xing yu zhou fa xing gu piao"},
		{"我的大王!", "wo de dai wang!"},
		{"A abc&1234个人. 银行旁边,一行人?", "A abc&1234 ge ren. yin xing pang bian, yi xing ren?"},
		{"A abc&1234个人,中国(银行)旁边;一“行”人?", "A abc&1234 ge ren, zhong guo (yin xing) pang bian; yi xing ren?"},
		{"字符串长桥: aBc多音字&123", "zi fu chuan chang qiao aBc duo yin zi &123"},
		{"【腾讯(00700)拟2.305亿元出售应收账款 赚1720万元】", "[teng xun (00700) ni 2.305 yi yuan chu shou ying shou zhang kuan zhuan 1720 wan yuan]"},
		{"地址:重庆市江北区重工业?", "di zhi: chong qing shi jiang bei qu zhong gong ye?"},
		{"交给团长,告诉他我们给予期望。前线的供给一定要能自给自足!", "jiao gei tuan zhang, gao su ta wo men ji yu qi wang. qian xian de gong ji yi ding yao neng zi ji zi zu!"},
		{"abc123", "abc123"},
		{"義灬骉驫芔淼㴇", "yi biao biao biao hui miao she"},
		{"", ""},
	}

	for _, c := range cases {
		actual := Paragraph(c.source)
		if c.expect != actual {
			t.Errorf("\nsource: %s\nexpect: %s\nactual: %s\n", c.source, c.expect, actual)
		}
	}
}
33 changes: 33 additions & 0 deletions phrase.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package pinyin

import (
"strings"

"github.com/yanyiwu/gojieba"
)

// jieba is the package-level word segmenter, created once when the package is
// initialised.
var jieba = gojieba.NewJieba()

// cutWords splits s into words using the package-level segmenter's CutAll.
func cutWords(s string) []string {
	words := jieba.CutAll(s)
	return words
}

// pinyinPhrase replaces known phrases in s with their pinyin.
//
// s is segmented with cutWords. Each word is looked up in phraseDict and, if
// that yields nothing, in phraseDictAddition. The result is passed through
// toFixed with paragraphOption. When the final value is non-empty, the first
// occurrence of the word in s is replaced by that value with a space on each
// side. Words without a match leave s untouched.
func pinyinPhrase(s string) string {
	for _, word := range cutWords(s) {
		py := phraseDict[word]
		if py == "" {
			py = phraseDictAddition[word]
		}

		py = toFixed(py, paragraphOption)
		if py == "" {
			continue
		}
		s = strings.Replace(s, word, " "+py+" ", 1)
	}

	return s
}

Loading

0 comments on commit c9122b8

Please sign in to comment.