From 071ab832700325b759718d06d8bfbfc9c3d3365e Mon Sep 17 00:00:00 2001 From: Fabio Bonelli Date: Thu, 30 May 2024 14:28:56 +0200 Subject: [PATCH] fix: reduce go routines and add debug logging The number of goroutines could explode the more Publishers there are, despite our bottleneck being the GitHub API rate limiting. Try to be more conservative with goroutines. --- crawler/crawler.go | 8 ++++++-- scanner/bitbucket.go | 4 ++++ scanner/github.go | 4 ++++ scanner/gitlab.go | 5 +++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index c243cbf..92c304a 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -129,7 +129,7 @@ func (c *Crawler) CrawlPublishers(publishers []common.Publisher) error { // Process every item in publishers. for _, publisher := range publishers { c.publishersWg.Add(1) - go c.ScanPublisher(publisher) + c.ScanPublisher(publisher) } // Close the repositories channel when all the publisher goroutines are done @@ -151,11 +151,15 @@ func (c *Crawler) crawl() error { // Get cpus number numCPUs := runtime.NumCPU() + log.Debugf("CPUs #: %d", numCPUs) // Process the repositories in order to retrieve the files. 
for i := 0; i < numCPUs; i++ { c.repositoriesWg.Add(1) - go c.ProcessRepositories(reposChan) + go func(id int) { + log.Debugf("Starting ProcessRepositories() goroutine (#%d)", id) + c.ProcessRepositories(reposChan) + }(i) } for repo := range c.repositories { diff --git a/scanner/bitbucket.go b/scanner/bitbucket.go index 40a0625..09724ba 100644 --- a/scanner/bitbucket.go +++ b/scanner/bitbucket.go @@ -22,6 +22,8 @@ func NewBitBucketScanner() Scanner { func (scanner BitBucketScanner) ScanGroupOfRepos( url url.URL, publisher common.Publisher, repositories chan common.Repository, ) error { + log.Debugf("BitBucketScanner.ScanGroupOfRepos(%s)", url.String()) + splitted := strings.Split(strings.Trim(url.Path, "/"), "/") if len(splitted) != 1 { @@ -82,6 +84,8 @@ func (scanner BitBucketScanner) ScanGroupOfRepos( func (scanner BitBucketScanner) ScanRepo( url url.URL, publisher common.Publisher, repositories chan common.Repository, ) error { + log.Debugf("BitBucketScanner.ScanRepo(%s)", url.String()) + splitted := strings.Split(strings.Trim(url.Path, "/"), "/") if len(splitted) != 2 { return fmt.Errorf("bitbucket URL %s doesn't look like a repo", url.String()) diff --git a/scanner/github.go b/scanner/github.go index 4f87265..b406dd8 100644 --- a/scanner/github.go +++ b/scanner/github.go @@ -47,6 +47,8 @@ func NewGitHubScanner() Scanner { func (scanner GitHubScanner) ScanGroupOfRepos( url url.URL, publisher common.Publisher, repositories chan common.Repository, ) error { + log.Debugf("GitHubScanner.ScanGroupOfRepos(%s)", url.String()) + opt := &github.RepositoryListByOrgOptions{} splitted := strings.Split(strings.Trim(url.Path, "/"), "/") @@ -128,6 +130,8 @@ func (scanner GitHubScanner) ScanGroupOfRepos( func (scanner GitHubScanner) ScanRepo( url url.URL, publisher common.Publisher, repositories chan common.Repository, ) error { + log.Debugf("GitHubScanner.ScanRepo(%s)", url.String()) + splitted := strings.Split(strings.Trim(url.Path, "/"), "/") if len(splitted) != 2 { return 
fmt.Errorf("doesn't look like a GitHub repo %s", url.String()) diff --git a/scanner/gitlab.go b/scanner/gitlab.go index e4fb35f..0193954 100644 --- a/scanner/gitlab.go +++ b/scanner/gitlab.go @@ -8,6 +8,7 @@ import ( "strings" "github.com/italia/publiccode-crawler/v4/common" + log "github.com/sirupsen/logrus" "github.com/xanzy/go-gitlab" ) @@ -21,6 +22,8 @@ func NewGitLabScanner() Scanner { func (scanner GitLabScanner) ScanGroupOfRepos( url url.URL, publisher common.Publisher, repositories chan common.Repository, ) error { + log.Debugf("GitLabScanner.ScanGroupOfRepos(%s)", url.String()) + apiURL, _ := url.Parse("/api/v4") git, err := gitlab.NewClient(os.Getenv("GITLAB_TOKEN"), gitlab.WithBaseURL(apiURL.String())) if err != nil { @@ -68,6 +71,8 @@ func (scanner GitLabScanner) ScanGroupOfRepos( func (scanner GitLabScanner) ScanRepo( url url.URL, publisher common.Publisher, repositories chan common.Repository, ) error { + log.Debugf("GitLabScanner.ScanRepo(%s)", url.String()) + apiURL, _ := url.Parse("/api/v4") git, err := gitlab.NewClient(os.Getenv("GITLAB_TOKEN"), gitlab.WithBaseURL(apiURL.String())) if err != nil {