-
Notifications
You must be signed in to change notification settings - Fork 111
/
parser.go
90 lines (79 loc) · 2.04 KB
/
parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package goose
import (
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
)
// Parser groups the HTML helper methods used for extracting main content and
// other properties from a document. It carries no state.
type Parser struct{}

// NewParser allocates a Parser and returns a pointer to it.
func NewParser() *Parser {
	return new(Parser)
}
// dropTag visits every element of selection and hands it to
// replaceTagWithContents together with whitelistedTextAtomTypes.
func (p Parser) dropTag(selection *goquery.Selection) {
	unwrap := func(_ int, item *goquery.Selection) {
		replaceTagWithContents(item, whitelistedTextAtomTypes)
	}
	selection.Each(unwrap)
}
// indexOfAttribute returns the index of the first attribute whose key equals
// attr on the first node of selection. It returns -1 when the attribute is not
// present, and also when selection is nil, empty, or its first node is nil.
func (p Parser) indexOfAttribute(selection *goquery.Selection, attr string) int {
	// Selection.Get(0) indexes Nodes directly and panics on an empty
	// selection, so check the length before touching the first node.
	if selection == nil || len(selection.Nodes) == 0 {
		return -1
	}
	node := selection.Nodes[0]
	if node == nil {
		return -1
	}
	for i, a := range node.Attr {
		if a.Key == attr {
			return i
		}
	}
	return -1
}
// delAttr removes the first attribute named attr from the first node of
// selection. Other nodes in the selection are left untouched. It is a no-op
// when selection is nil or empty, or when the attribute is not present.
func (p Parser) delAttr(selection *goquery.Selection, attr string) {
	// Guard here as well: looking up the first node of an empty selection
	// would panic with an index-out-of-range error.
	if selection == nil || len(selection.Nodes) == 0 {
		return
	}
	idx := p.indexOfAttribute(selection, attr)
	if idx < 0 {
		return
	}
	node := selection.Nodes[0]
	// Splice the attribute out in place, preserving the order of the rest.
	node.Attr = append(node.Attr[:idx], node.Attr[idx+1:]...)
}
// getElementsByTags runs div.Find once for every entry in tags and merges the
// results with Union into a single selection. When tags is empty the returned
// selection is a freshly allocated, empty one.
func (p Parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection {
	result := &goquery.Selection{}
	for i := range tags {
		if found := div.Find(tags[i]); found != nil {
			result = result.Union(found)
		}
	}
	return result
}
// clear empties selection by replacing its node list with a new, non-nil,
// zero-length slice. The nodes themselves are not modified.
func (p Parser) clear(selection *goquery.Selection) {
	selection.Nodes = []*html.Node{}
}
// removeNode detaches the first node of selection from its parent. It is a
// no-op when selection is nil or empty, or when the first node is nil or has
// no parent.
func (p Parser) removeNode(selection *goquery.Selection) {
	// A non-nil but empty selection must be rejected explicitly:
	// Selection.Get(0) would panic with an index-out-of-range error.
	if selection == nil || len(selection.Nodes) == 0 {
		return
	}
	node := selection.Nodes[0]
	if node == nil || node.Parent == nil {
		return
	}
	node.Parent.RemoveChild(node)
}
// name looks up the attribute called selector on selection via
// Selection.Attr and returns its value, or the empty string when Attr
// reports that the attribute does not exist.
func (p Parser) name(selector string, selection *goquery.Selection) string {
	if value, found := selection.Attr(selector); found {
		return value
	}
	return ""
}
// setAttr sets attribute attr to value on the first node of selection.
//
// Every existing attribute whose key equals attr is dropped and a single new
// attribute is appended at the end of the list, so the attribute may change
// position. All other attributes are kept unchanged, including their
// Namespace. It is a no-op when selection is nil or empty.
func (p Parser) setAttr(selection *goquery.Selection, attr string, value string) {
	if selection == nil || selection.Size() == 0 {
		return
	}
	node := selection.Get(0)
	attrs := make([]html.Attribute, 0, len(node.Attr)+1)
	for _, a := range node.Attr {
		if a.Key == attr {
			continue
		}
		// Copy the whole attribute: rebuilding it from Key and Val alone
		// would discard the Namespace of namespaced attributes.
		attrs = append(attrs, a)
	}
	node.Attr = append(attrs, html.Attribute{Key: attr, Val: value})
}