diff --git a/main.go b/main.go index fa0fe5d..55bd7e2 100644 --- a/main.go +++ b/main.go @@ -10,7 +10,7 @@ import ( ) const ( - version = "v0.1.3.4" + version = "v0.1.3.5" numFetcher = 10 numParser = 50 numRenderer = 5 @@ -35,7 +35,7 @@ func init() { outputPath := "./output" if _, err := os.Stat(outputPath); os.IsNotExist(err) { - err = os.Mkdir(outputPath, 0644) + err = os.Mkdir(outputPath, 0755) if err != nil { log.Fatalf("Error creating output folder: %v", err) } @@ -90,21 +90,21 @@ func main() { log.Printf("[Fetch] job done") continue } - log.Printf("[Fetch] error: %v\n", err) + fmt.Fprintf(os.Stderr, "[Fetch] error: %v\n", err) case err, ok := <-errcParse: if !ok { errcParse = nil log.Printf("[Parse] job done") continue } - log.Printf("[Parse] error: %v\n", err) + fmt.Fprintf(os.Stderr, "[Parse] error: %v\n", err) case err, ok := <-errcRender: if !ok { errcRender = nil log.Printf("[Template] job done") continue } - log.Printf("[Template] error: %v\n", err) + fmt.Fprintf(os.Stderr, "[Template] error: %v\n", err) case file, ok := <-outputc: if ok { log.Printf("[Template] %s done\n", file) diff --git a/parse.go b/parse.go index c080345..a659d07 100644 --- a/parse.go +++ b/parse.go @@ -9,9 +9,9 @@ import ( "log" "math/rand" "net/url" + "os" "regexp" "strconv" - "strings" "sync" "time" @@ -34,7 +34,7 @@ func htmlParse(pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, callback fun return fmt.Errorf("Error parsing %s: %v", page.URL, err) } - posts := doc.Find("div.l_post.l_post_bright.j_l_post.clearfix") + posts := doc.Find("div.l_post.j_l_post.l_post_bright") threadRegex := regexp.MustCompile(`\b"?thread_id"?:"?(\d+)"?\b`) match := threadRegex.FindStringSubmatch(string(page.Content)) strInt, _ := strconv.ParseInt(match[1], 10, 64) @@ -114,7 +114,7 @@ func homepageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap q.Set("fid", strconv.Itoa(int(forumID))) q.Set("pn", strconv.Itoa(int(i))) u.RawQuery = q.Encode() - log.Printf("requesting totalComment: %s", u) + // log.Printf("requesting totalComment: %s", u) select { case <-done: return @@ -146,16 +146,10 @@ func pageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *Te }() defer tf.AddPage(-1) posts.Each(func(i int, s *goquery.Selection) { - // filter elements that has more than 4 class (maybe an advertisement) - classStr, _ := s.Attr("class") // get class string - if len(strings.Fields(classStr)) > 4 { - return - } - dataField, ok := s.Attr("data-field") if !ok { // maybe not an error, but an older version of data-field - // log.Printf("#%d data-field not found: %s", i, page.URL.String()) // there's a error on the page, maybe Tieba updated the syntax + fmt.Fprintf(os.Stderr, "#%d data-field not found: %s\n", i, page.URL) // there's a error on the page, maybe Tieba updated the syntax return } @@ -163,10 +157,16 @@ func pageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *Te var res OutputField err := json.Unmarshal([]byte(dataField), &tiebaPost) if err != nil { - log.Printf("#%d data-field unmarshal failed: %v, url: %s", i, err, page.URL) // there's a error on the page, maybe Tieba updated the syntax + fmt.Fprintf(os.Stderr, "#%d data-field unmarshal failed: %v, url: %s\n", i, err, page.URL) // there's a error on the page, maybe Tieba updated the syntax return } - res.UserName = tiebaPost.Author.UserName + if content, err := s.Find("div.d_author ul.p_author li.d_name a.p_author_name.j_user_card").Html(); err != nil { + fmt.Fprintf(os.Stderr, "#%d Error parsing username from %s\n", i, page.URL) + return + } else { + res.UserName = template.HTML(content) + } + res.Content = template.HTML(tiebaPost.Content.Content) res.PostNO = tiebaPost.Content.PostNO res.PostID = tiebaPost.Content.PostID @@ -352,11 +352,17 @@ func commentParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap if err != nil { return } - user := s.Find(".j_user_card.lzl_p_p") - userName, ok := user.Attr("username") - if !ok { - // userName not found - log.Printf("ExLzl: cannot find username for pid=%s, index=%d", pid, i+pageNum*10) + user := s.Find("div.lzl_cnt a.at.j_user_card") + userName := user.Text() + // userName, ok := user.Attr("username") + // if !ok { + // // userName not found + // log.Printf("ExLzl: cannot find username for pid=%s, index=%d", pid, i+pageNum*10) + // return + // } else + if userName == "" { + // user name is empty, try another method + log.Printf("ExLzl: please check url: %s", page.URL) return } c := &LzlContent{ diff --git a/template.go b/template.go index 245e805..2243f36 100644 --- a/template.go +++ b/template.go @@ -90,7 +90,7 @@ func renderHTML(done <-chan struct{}, tempc <-chan *TemplateField, tmpl *templat err = writeOutput(filename, func(w *bufio.Writer) error { if err := tmpl.Execute(w, struct { Title string - Url string + Url string Comments []*OutputField Lzls map[uint64]*LzlComment }{Title: t.Title, Url: t.Url, Comments: t.Comments, Lzls: t.Lzls.Map}); err != nil { diff --git a/type.go b/type.go index ce39861..97ef99b 100644 --- a/type.go +++ b/type.go @@ -149,7 +149,7 @@ type LzlPageComment struct { // OutputField render Tieba post in template type OutputField struct { - UserName string + UserName template.HTML Content template.HTML PostNO uint64 PostID uint64 @@ -187,7 +187,7 @@ func (lzl *LzlMap) IsExist(k uint64) bool { // TemplateField stores all necessary information to render a HTML page type TemplateField struct { Title string - Url string + Url string ThreadID uint64 Comments []*OutputField pagesLeft int64