From 6164f578643e37610988c09261dac88894ae1987 Mon Sep 17 00:00:00 2001 From: Fabio Bonelli Date: Mon, 12 Oct 2020 11:25:45 +0200 Subject: [PATCH] Skip publishers if they contain duplicate orgs. (#198) crawl now skips publishers if they contain an organization that has already been seen. The order of preference of the YAML files containing the publishers is deduced by the ordering of the arguments to the crawl command. Note we don't check for duplicate repos, only organizations. Fix #185. --- crawler/cmd/crawl.go | 15 ++++++++++++++- crawler/start.sh | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/crawler/cmd/crawl.go b/crawler/cmd/crawl.go index 584ec62f..61d7890c 100644 --- a/crawler/cmd/crawl.go +++ b/crawler/cmd/crawl.go @@ -16,6 +16,7 @@ var crawlCmd = &cobra.Command{ Long: `Crawl publiccode.yml files according to the supplied whitelist file(s).`, Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { + orgs := make(map[string]bool) c := crawler.NewCrawler() // Read the supplied whitelists. @@ -25,7 +26,19 @@ var crawlCmd = &cobra.Command{ if err != nil { log.Fatal(err) } - publishers = append(publishers, readWhitelist...) + + Publisher: + for _, publisher := range readWhitelist { + for _, org := range publisher.Organizations { + if orgs[org] { + log.Warnf("Skipping publisher '%s': organization '%s' already present", publisher.Name, org) + continue Publisher + } else { + orgs[org] = true + } + } + publishers = append(publishers, publisher) + } } toBeRemoved, err := c.CrawlPublishers(publishers) diff --git a/crawler/start.sh b/crawler/start.sh index b223440b..6f6d20ee 100755 --- a/crawler/start.sh +++ b/crawler/start.sh @@ -10,5 +10,5 @@ echo "${0##*/}: Waiting ${time} seconds before running the crawler..." 
sleep ${time} bin/crawler updateipa -bin/crawler download-whitelist https://onboarding.developers.italia.it/repo-list whitelist/onboarding-reuse.yml +bin/crawler download-whitelist https://onboarding.developers.italia.it/repo-list whitelist/00-onboarding-reuse.yml bin/crawler crawl whitelist/*.yml