Skip to content

Commit

Permalink
Skip publishers if they contain duplicate orgs. (#198)
Browse files Browse the repository at this point in the history
crawl now skips publishers if they contain an organization that
has already been seen. The order of preference of the YAML files
containing the publishers is determined by the order of the arguments
to the crawl command.

Note we don't check for duplicate repos, only organizations.

Fix #185.
  • Loading branch information
bfabio authored Oct 12, 2020
1 parent 9f22ae3 commit 6164f57
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
15 changes: 14 additions & 1 deletion crawler/cmd/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ var crawlCmd = &cobra.Command{
Long: `Crawl publiccode.yml files according to the supplied whitelist file(s).`,
Args: cobra.MinimumNArgs(1),
Run: func(cmd *cobra.Command, args []string) {
orgs := make(map[string]bool)
c := crawler.NewCrawler()

// Read the supplied whitelists.
Expand All @@ -25,7 +26,19 @@ var crawlCmd = &cobra.Command{
if err != nil {
log.Fatal(err)
}
publishers = append(publishers, readWhitelist...)

Publisher:
for _, publisher := range readWhitelist {
for _, org := range publisher.Organizations {
if orgs[org] {
log.Warnf("Skipping publisher '%s': organization '%s' already present", publisher.Name, org)
continue Publisher
} else {
orgs[org] = true
}
}
publishers = append(publishers, publisher)
}
}

toBeRemoved, err := c.CrawlPublishers(publishers)
Expand Down
2 changes: 1 addition & 1 deletion crawler/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ echo "${0##*/}: Waiting ${time} seconds before running the crawler..."
sleep ${time}

bin/crawler updateipa
bin/crawler download-whitelist https://onboarding.developers.italia.it/repo-list whitelist/onboarding-reuse.yml
bin/crawler download-whitelist https://onboarding.developers.italia.it/repo-list whitelist/00-onboarding-reuse.yml
bin/crawler crawl whitelist/*.yml

0 comments on commit 6164f57

Please sign in to comment.