Skip to content

Commit

Permalink
new method for username #4
Browse files Browse the repository at this point in the history
  • Loading branch information
hjhee committed Apr 27, 2020
1 parent f099948 commit 5997cbe
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 25 deletions.
10 changes: 5 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
)

const (
version = "v0.1.3.4"
version = "v0.1.3.5"
numFetcher = 10
numParser = 50
numRenderer = 5
Expand All @@ -35,7 +35,7 @@ func init() {

outputPath := "./output"
if _, err := os.Stat(outputPath); os.IsNotExist(err) {
err = os.Mkdir(outputPath, 0644)
err = os.Mkdir(outputPath, 0755)
if err != nil {
log.Fatalf("Error creating output folder: %v", err)
}
Expand Down Expand Up @@ -90,21 +90,21 @@ func main() {
log.Printf("[Fetch] job done")
continue
}
log.Printf("[Fetch] error: %v\n", err)
fmt.Fprintf(os.Stderr, "[Fetch] error: %v\n", err)
case err, ok := <-errcParse:
if !ok {
errcParse = nil
log.Printf("[Parse] job done")
continue
}
log.Printf("[Parse] error: %v\n", err)
fmt.Fprintf(os.Stderr, "[Parse] error: %v\n", err)
case err, ok := <-errcRender:
if !ok {
errcRender = nil
log.Printf("[Template] job done")
continue
}
log.Printf("[Template] error: %v\n", err)
fmt.Fprintf(os.Stderr, "[Template] error: %v\n", err)
case file, ok := <-outputc:
if ok {
log.Printf("[Template] %s done\n", file)
Expand Down
40 changes: 23 additions & 17 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ import (
"log"
"math/rand"
"net/url"
"os"
"regexp"
"strconv"
"strings"
"sync"
"time"

Expand All @@ -34,7 +34,7 @@ func htmlParse(pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, callback fun
return fmt.Errorf("Error parsing %s: %v", page.URL, err)
}

posts := doc.Find("div.l_post.l_post_bright.j_l_post.clearfix")
posts := doc.Find("div.l_post.j_l_post.l_post_bright")
threadRegex := regexp.MustCompile(`\b"?thread_id"?:"?(\d+)"?\b`)
match := threadRegex.FindStringSubmatch(string(page.Content))
strInt, _ := strconv.ParseInt(match[1], 10, 64)
Expand Down Expand Up @@ -114,7 +114,7 @@ func homepageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap
q.Set("fid", strconv.Itoa(int(forumID)))
q.Set("pn", strconv.Itoa(int(i)))
u.RawQuery = q.Encode()
log.Printf("requesting totalComment: %s", u)
// log.Printf("requesting totalComment: %s", u)
select {
case <-done:
return
Expand Down Expand Up @@ -146,27 +146,27 @@ func pageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *Te
}()
defer tf.AddPage(-1)
posts.Each(func(i int, s *goquery.Selection) {
// filter out elements that have more than 4 classes (possibly advertisements)
classStr, _ := s.Attr("class") // get class string
if len(strings.Fields(classStr)) > 4 {
return
}

dataField, ok := s.Attr("data-field")
if !ok {
// maybe not an error, but an older version of data-field
// log.Printf("#%d data-field not found: %s", i, page.URL.String()) // there's an error on the page, maybe Tieba updated the syntax
fmt.Fprintf(os.Stderr, "#%d data-field not found: %s\n", i, page.URL) // there's a error on the page, maybe Tieba updated the syntax
return
}

var tiebaPost TiebaField
var res OutputField
err := json.Unmarshal([]byte(dataField), &tiebaPost)
if err != nil {
log.Printf("#%d data-field unmarshal failed: %v, url: %s", i, err, page.URL) // there's a error on the page, maybe Tieba updated the syntax
fmt.Fprintf(os.Stderr, "#%d data-field unmarshal failed: %v, url: %s\n", i, err, page.URL) // there's a error on the page, maybe Tieba updated the syntax
return
}
res.UserName = tiebaPost.Author.UserName
if content, err := s.Find("div.d_author ul.p_author li.d_name a.p_author_name.j_user_card").Html(); err != nil {
fmt.Fprintf(os.Stderr, "#%d Error parsing username from %s\n", i, page.URL)
return
} else {
res.UserName = template.HTML(content)
}

res.Content = template.HTML(tiebaPost.Content.Content)
res.PostNO = tiebaPost.Content.PostNO
res.PostID = tiebaPost.Content.PostID
Expand Down Expand Up @@ -352,11 +352,17 @@ func commentParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap
if err != nil {
return
}
user := s.Find(".j_user_card.lzl_p_p")
userName, ok := user.Attr("username")
if !ok {
// userName not found
log.Printf("ExLzl: cannot find username for pid=%s, index=%d", pid, i+pageNum*10)
user := s.Find("div.lzl_cnt a.at.j_user_card")
userName := user.Text()
// userName, ok := user.Attr("username")
// if !ok {
// // userName not found
// log.Printf("ExLzl: cannot find username for pid=%s, index=%d", pid, i+pageNum*10)
// return
// } else
if userName == "" {
// user name is empty: log the URL for manual checking and skip this entry (TODO: try another method)
log.Printf("ExLzl: please check url: %s", page.URL)
return
}
c := &LzlContent{
Expand Down
2 changes: 1 addition & 1 deletion template.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ func renderHTML(done <-chan struct{}, tempc <-chan *TemplateField, tmpl *templat
err = writeOutput(filename, func(w *bufio.Writer) error {
if err := tmpl.Execute(w, struct {
Title string
Url string
Url string
Comments []*OutputField
Lzls map[uint64]*LzlComment
}{Title: t.Title, Url: t.Url, Comments: t.Comments, Lzls: t.Lzls.Map}); err != nil {
Expand Down
4 changes: 2 additions & 2 deletions type.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ type LzlPageComment struct {

// OutputField render Tieba post in template
type OutputField struct {
UserName string
UserName template.HTML
Content template.HTML
PostNO uint64
PostID uint64
Expand Down Expand Up @@ -187,7 +187,7 @@ func (lzl *LzlMap) IsExist(k uint64) bool {
// TemplateField stores all necessary information to render a HTML page
type TemplateField struct {
Title string
Url string
Url string
ThreadID uint64
Comments []*OutputField
pagesLeft int64
Expand Down

0 comments on commit 5997cbe

Please sign in to comment.