diff --git a/crawler/cmd/crawl.go b/crawler/cmd/crawl.go index 584ec62f..61d7890c 100644 --- a/crawler/cmd/crawl.go +++ b/crawler/cmd/crawl.go @@ -16,6 +16,7 @@ var crawlCmd = &cobra.Command{ Long: `Crawl publiccode.yml files according to the supplied whitelist file(s).`, Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { + orgs := make(map[string]bool) c := crawler.NewCrawler() // Read the supplied whitelists. @@ -25,7 +26,19 @@ var crawlCmd = &cobra.Command{ if err != nil { log.Fatal(err) } - publishers = append(publishers, readWhitelist...) + + Publisher: + for _, publisher := range readWhitelist { + for _, org := range publisher.Organizations { + if orgs[org] { + log.Warnf("Skipping publisher '%s': organization '%s' already present", publisher.Name, org) + continue Publisher + } else { + orgs[org] = true + } + } + publishers = append(publishers, publisher) + } } toBeRemoved, err := c.CrawlPublishers(publishers) diff --git a/crawler/start.sh b/crawler/start.sh index b223440b..6f6d20ee 100755 --- a/crawler/start.sh +++ b/crawler/start.sh @@ -10,5 +10,5 @@ echo "${0##*/}: Waiting ${time} seconds before running the crawler..." sleep ${time} bin/crawler updateipa -bin/crawler download-whitelist https://onboarding.developers.italia.it/repo-list whitelist/onboarding-reuse.yml +bin/crawler download-whitelist https://onboarding.developers.italia.it/repo-list whitelist/00-onboarding-reuse.yml bin/crawler crawl whitelist/*.yml