From 413f7b415f571eed361107db4c77c406354d4366 Mon Sep 17 00:00:00 2001
From: PrinceCaspian
Date: Sat, 21 Oct 2023 18:53:00 -0500
Subject: [PATCH 1/3] Scrape 'Research Tags' for professor profiles

---
 scrapers/profiles.go | 47 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/scrapers/profiles.go b/scrapers/profiles.go
index 7ef64be..468e2cf 100644
--- a/scrapers/profiles.go
+++ b/scrapers/profiles.go
@@ -5,21 +5,22 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+
 	"github.com/UTDNebula/nebula-api/schema"
 	"github.com/chromedp/cdproto/cdp"
 	"github.com/chromedp/cdproto/runtime"
 	"github.com/chromedp/chromedp"
 	"go.mongodb.org/mongo-driver/bson/primitive"
-	"os"
-	"regexp"
-	"strconv"
-	"strings"
 )
 
 const BASE_URL string = "https://profiles.utdallas.edu/browse?page="
 
-var primaryLocationRegex *regexp.Regexp = regexp.MustCompile("^(\\w+)\\s+(\\d+\\.\\d{3}[A-z]?)$")
-var fallbackLocationRegex *regexp.Regexp = regexp.MustCompile("^([A-z]+)(\\d+)\\.?(\\d{3}[A-z]?)$")
+var primaryLocationRegex *regexp.Regexp = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-z]?)$`)
+var fallbackLocationRegex *regexp.Regexp = regexp.MustCompile(`^([A-z]+)(\d+)\.?(\d{3}[A-z]?)$`)
 
 func parseLocation(text string) schema.Location {
 	var building string
@@ -118,7 +119,7 @@ func scrapeProfessorLinks() []string {
 			for _, node := range nodes {
 				href, hasHref := node.Attribute("href")
 				if !hasHref {
-					return errors.New("Professor card was missing an href!")
+					return errors.New("professor card was missing an href")
 				}
 				professorLinks = append(professorLinks, href)
 			}
@@ -183,7 +184,7 @@ func ScrapeProfiles(outDir string) {
 					var hasSrc bool
 					imageUri, hasSrc = attributes["src"]
 					if !hasSrc {
-						return errors.New("No src found for imageUri!")
+						return errors.New("no src found for imageUri")
 					}
 				}
 				return err
@@ -198,7 +199,7 @@ func ScrapeProfiles(outDir string) {
 					var hasStyle bool
 					imageUri, hasStyle = attributes["style"]
 					if !hasStyle {
-						return errors.New("No style found for imageUri!")
+						return errors.New("no style found for imageUri")
 					}
 					imageUri = imageUri[23 : len(imageUri)-3]
 				}
@@ -255,6 +256,7 @@ func ScrapeProfiles(outDir string) {
 				var tempText string
 				err := chromedp.Text("div.contact_info > div", &tempText).Do(ctx)
 				texts = strings.Split(tempText, "\n")
+				fmt.Println(tempText)
 				return err
 			},
 		),
@@ -267,6 +269,32 @@ func ScrapeProfiles(outDir string) {
 		phoneNumber, office := parseList(texts)
 		fmt.Printf("Parsed list! #: %s, Office: %v\n\n", phoneNumber, office)
 
+		//Get the Tags
+		var tags map[string]string = map[string]string{}
+		fmt.Printf("Scraping tags...\n")
+		_, err = chromedp.RunResponse(chromedpCtx,
+			chromedp.Navigate(link),
+			chromedp.QueryAfter(".tags-badge",
+				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
+					for _, node := range nodes {
+						tempText := getNodeText(node)
+						href, hasHref := node.Attribute("href")
+						if !hasHref {
+							return errors.New("professor card was missing an href")
+						}
+						tags[tempText] = href
+					}
+					return nil
+				}, chromedp.AtLeast(0),
+			),
+		)
+
+		if err != nil {
+			panic(err)
+		}
+
+		fmt.Printf("Parsed tags! #: %s\n", tags)
+
 		professors = append(professors, schema.Professor{
 			Id:           schema.IdWrapper{Id: primitive.NewObjectID()},
 			First_name:   firstName,
@@ -279,6 +307,7 @@ func ScrapeProfiles(outDir string) {
 			Image_uri:    imageUri,
 			Office_hours: []schema.Meeting{},
 			Sections:     []schema.IdWrapper{},
+			Tags:         tags,
 		})
 
 		fmt.Printf("Scraped profile for %s %s!\n\n", firstName, lastName)

From 9279c8b4a53b4e30f3b9d1d01c18143826b8d3d9 Mon Sep 17 00:00:00 2001
From: PrinceCaspian
Date: Sun, 22 Oct 2023 14:47:56 -0500
Subject: [PATCH 2/3] Education Scraper

---
 scrapers/profiles.go | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/scrapers/profiles.go b/scrapers/profiles.go
index 468e2cf..a63bdde 100644
--- a/scrapers/profiles.go
+++ b/scrapers/profiles.go
@@ -12,6 +12,7 @@ import (
 
 	"github.com/UTDNebula/nebula-api/schema"
 	"github.com/chromedp/cdproto/cdp"
+	"github.com/chromedp/cdproto/dom"
 	"github.com/chromedp/cdproto/runtime"
 	"github.com/chromedp/chromedp"
 	"go.mongodb.org/mongo-driver/bson/primitive"
@@ -271,7 +272,8 @@ func ScrapeProfiles(outDir string) {
 
 		//Get the Tags
 		var tags map[string]string = map[string]string{}
-		fmt.Printf("Scraping tags...\n")
+		var educations [][]string = [][]string{}
+		fmt.Printf("Scraping tags and Educations...\n")
 		_, err = chromedp.RunResponse(chromedpCtx,
 			chromedp.Navigate(link),
 			chromedp.QueryAfter(".tags-badge",
@@ -287,13 +289,39 @@ func ScrapeProfiles(outDir string) {
 					return nil
 				}, chromedp.AtLeast(0),
 			),
+			chromedp.QueryAfter("#preparation>div",
+				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
+					for _, node := range nodes {
+						//This successfully gets to the correct divs,
+						//however major workarounds are required because there is text not within any node
+						element, err := dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
+
+						if err != nil {
+							return err
+						}
+
+						regexSplitter := regexp.MustCompile(`\s?<[\w+" "|=]*>\s?|\s?<\/[\w]*>\s?|[\s]{2,}|\t|\s?-\s?`)
+						out := []string{}
+
+						for _, val := range regexSplitter.Split(element, -1) {
+							if val != "" {
+								out = append(out, val)
+							}
+						}
+
+						educations = append(educations, out)
+					}
+					return nil
+				}, chromedp.AtLeast(0),
+			),
 		)
 
 		if err != nil {
 			panic(err)
 		}
 
-		fmt.Printf("Parsed tags! #: %s\n", tags)
+		fmt.Printf("Scraped tags! #: %s\n", tags)
+		fmt.Printf("Scraped educations! #: %s\n", educations)
 
 		professors = append(professors, schema.Professor{
 			Id:           schema.IdWrapper{Id: primitive.NewObjectID()},
@@ -308,6 +336,7 @@ func ScrapeProfiles(outDir string) {
 			Office_hours: []schema.Meeting{},
 			Sections:     []schema.IdWrapper{},
 			Tags:         tags,
+			Education:    educations,
 		})
 
 		fmt.Printf("Scraped profile for %s %s!\n\n", firstName, lastName)

From 1ef34c4e6b161667ecbfc8ce3b489c5fcac112e2 Mon Sep 17 00:00:00 2001
From: PrinceCaspian
Date: Fri, 27 Oct 2023 15:34:36 -0500
Subject: [PATCH 3/3] Add logs to profile changes & stop renavigating

---
 scrapers/profiles.go | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/scrapers/profiles.go b/scrapers/profiles.go
index 5de9ad7..8ea3507 100644
--- a/scrapers/profiles.go
+++ b/scrapers/profiles.go
@@ -274,9 +274,8 @@ func ScrapeProfiles(outDir string) {
 		//Get the Tags
 		var tags map[string]string = map[string]string{}
 		var educations [][]string = [][]string{}
-		fmt.Printf("Scraping tags and Educations...\n")
-		_, err = chromedp.RunResponse(chromedpCtx,
-			chromedp.Navigate(link),
+		log.Printf("Scraping tags and Educations...\n")
+		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter(".tags-badge",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 					for _, node := range nodes {
 						tempText := getNodeText(node)
 						href, hasHref := node.Attribute("href")
 						if !hasHref {
 							return errors.New("professor card was missing an href")
 						}
 						tags[tempText] = href
 					}
 					return nil
 				}, chromedp.AtLeast(0),
 			),
+		)
+
+		if err != nil {
+			panic(err)
+		}
+
+		err = chromedp.Run(chromedpCtx,
 			chromedp.QueryAfter("#preparation>div",
 				func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
 					for _, node := range nodes {
 						//This successfully gets to the correct divs,
 						//however major workarounds are required because there is text not within any node
 						element, err := dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
 
 						if err != nil {
 							return err
 						}
 
 						regexSplitter := regexp.MustCompile(`\s?<[\w+" "|=]*>\s?|\s?<\/[\w]*>\s?|[\s]{2,}|\t|\s?-\s?`)
 						out := []string{}
 
 						for _, val := range regexSplitter.Split(element, -1) {
 							if val != "" {
 								out = append(out, val)
 							}
 						}
 
 						educations = append(educations, out)
 					}
 					return nil
 				}, chromedp.AtLeast(0),
 			),
 		)
 
 		if err != nil {
 			panic(err)
 		}
 
-		fmt.Printf("Scraped tags! #: %s\n", tags)
-		fmt.Printf("Scraped educations! #: %s\n", educations)
+		log.Printf("Scraped tags! #: %s\n", tags)
+		log.Printf("Scraped educations! #: %s\n", educations)
 
 		professors = append(professors, schema.Professor{
 			Id:           schema.IdWrapper{Id: primitive.NewObjectID()},
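
Note on the series (illustrative, not part of the diffs above): the education scraper in PATCH 2/3 has to split the raw outer HTML of each "#preparation>div" because the degree/major/institution text is not wrapped in its own nodes. The standalone sketch below reuses the exact regexSplitter pattern from the patch; the HTML fragment is a hypothetical stand-in for what dom.GetOuterHTML() might return for one entry.

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same pattern the patch compiles inside the #preparation>div callback:
	// it matches opening/closing tags, runs of whitespace, tabs, and the
	// " - " separators between degree, major, and institution.
	regexSplitter := regexp.MustCompile(`\s?<[\w+" "|=]*>\s?|\s?<\/[\w]*>\s?|[\s]{2,}|\t|\s?-\s?`)

	// Hypothetical outer HTML for a single education entry.
	element := `<div>PHD - Computer Science - The University of Texas at Dallas</div>`

	// Split around every match and keep only the non-empty pieces, exactly
	// as the patch does before appending to educations.
	out := []string{}
	for _, val := range regexSplitter.Split(element, -1) {
		if val != "" {
			out = append(out, val)
		}
	}

	// Expected output: [PHD Computer Science The University of Texas at Dallas]
	fmt.Println(out)
}

Dropping the empty strings matters because Split leaves empty entries around the leading <div> and trailing </div> matches; only the text between tags and " - " separators survives into each educations slice.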