diff --git a/README.md b/README.md index c9c3ca6..df3a124 100644 --- a/README.md +++ b/README.md @@ -9,15 +9,15 @@ Developers Italia provides [a catalog of Free and Open Source](https://developers.italia.it/en/search) software aimed to Public Administrations. -This **crawler** retrieves the `publiccode.yml` files from the +`publiccode-crawler` retrieves the `publiccode.yml` files from the repositories of publishers found in the [Developers Italia API](https://github.com/italia/developers-italia-api). ## Setup and deployment processes -The crawler can either run manually on the target machine or it can be deployed +`publiccode-crawler` can either run manually on the target machine or it can be deployed from a Docker container. -### Manually configure and build the crawler +### Manually configure and build 1. Rename `config.toml.example` to `config.toml` and set the variables @@ -43,16 +43,23 @@ docker run -it italia/publiccode-crawler ## Commands -### `crawler crawl` +### `publiccode-crawler crawl` Gets the list of publishers from `https://api.developers.italia.it/v1/publishers` and starts to crawl their repositories. -### `crawler crawl publishers*.yml` +### `publiccode-crawler crawl publishers*.yml` Gets the list of publishers in `publishers*.yml` and starts to crawl their repositories. +### `publiccode-crawler crawl-software` + +Crawl just the software specified as parameter. +It takes the software URL and its publisher id as parameters. + +Ex. 
`publiccode-crawler crawl-software https://api.developers.italia.it/v1/software/a2ea59b0-87cd-4419-b93f-00bed8a7b859 edb66b3d-3e36-4b69-aba9-b7c4661b3fdd` + ### Other commands * `crawler download-publishers` downloads organizations and repositories from diff --git a/apiclient/apiclient.go b/apiclient/apiclient.go index c08dfc6..ee019f0 100644 --- a/apiclient/apiclient.go +++ b/apiclient/apiclient.go @@ -192,6 +192,25 @@ page: return publishers, nil } +// GetSoftware returns the software with the given id or any error encountered. +func (clt APIClient) GetSoftware(id string) (*Software, error) { + var softwareResponse Software + + res, err := clt.retryableClient.Get(joinPath(clt.baseURL, "/software") + "/" + id) + if err != nil { + return nil, fmt.Errorf("can't GET /software/%s: %w", id, err) + } + + defer res.Body.Close() + + err = json.NewDecoder(res.Body).Decode(&softwareResponse) + if err != nil { + return nil, fmt.Errorf("can't parse GET /software/%s response: %w", id, err) + } + + return &softwareResponse, nil +} + // GetSoftwareByURL returns the software matching the given repo URL and // any error encountered. // In case no software is found and no error occours, (nil, nil) is returned. diff --git a/cmd/crawl-software.go b/cmd/crawl-software.go new file mode 100644 index 0000000..eb4f252 --- /dev/null +++ b/cmd/crawl-software.go @@ -0,0 +1,43 @@ +package cmd + +import ( + "github.com/italia/publiccode-crawler/v4/common" + "github.com/italia/publiccode-crawler/v4/crawler" + log "github.com/sirupsen/logrus" + "github.com/spf13/cobra" + "github.com/spf13/viper" +) + +func init() { + crawlSoftwareCmd.Flags().BoolVarP(&dryRun, "dry-run", "n", false, "perform a dry run with no changes made") + + rootCmd.AddCommand(crawlSoftwareCmd) +} + +var crawlSoftwareCmd = &cobra.Command{ + Use: "crawl-software [SOFTWARE_ID | SOFTWARE_URL] PUBLISHER_ID", + Short: "Crawl a single software by its id.", + Long: `Crawl a single software by its id. 
+ +Crawl a single software given its API id and its publisher.`, + Example: "# Crawl just the specified software\n" + + "publiccode-crawler crawl-software" + + " https://api.developers.italia.it/v1/software/af6056fc-b2b2-4d31-9961-c9bd94e32bd4 PCM", + + Args: cobra.ExactArgs(2), + Run: func(_ *cobra.Command, args []string) { + if token := viper.GetString("GITHUB_TOKEN"); token == "" { + log.Fatal("Please set GITHUB_TOKEN, it's needed to use the GitHub API'") + } + + c := crawler.NewCrawler(dryRun) + + publisher := common.Publisher{ + ID: args[1], + } + + if err := c.CrawlSoftwareByID(args[0], publisher); err != nil { + log.Fatal(err) + } + }, +} diff --git a/cmd/crawl.go b/cmd/crawl.go index 36e9dc5..e1309d4 100644 --- a/cmd/crawl.go +++ b/cmd/crawl.go @@ -16,12 +16,21 @@ func init() { } var crawlCmd = &cobra.Command{ - Use: "crawl publishers.yml [directory/*.yml ...]", + Use: "crawl [publishers.yml] [directory/*.yml ...]", Short: "Crawl publiccode.yml files in publishers' repos.", Long: `Crawl publiccode.yml files in publishers' repos. 
When run with no arguments, the publishers are fetched from the API, otherwise the passed YAML files are used.`, + Example: ` +# Crawl publishers fetched from the API +crawl + +# Crawl using a specific publishers.yml file +crawl publishers.yml + +# Crawl all YAML files in a specific directory +crawl directory/*.yml`, Args: cobra.MinimumNArgs(0), Run: func(_ *cobra.Command, args []string) { diff --git a/cmd/one.go b/cmd/one.go deleted file mode 100644 index 2f6625a..0000000 --- a/cmd/one.go +++ /dev/null @@ -1,78 +0,0 @@ -package cmd - -import ( - "net/url" - "regexp" - - "github.com/italia/publiccode-crawler/v4/common" - "github.com/italia/publiccode-crawler/v4/crawler" - log "github.com/sirupsen/logrus" - "github.com/spf13/cobra" -) - -func init() { - oneCmd.Flags().BoolVarP(&dryRun, "dry-run", "n", false, "perform a dry run with no changes made") - - rootCmd.AddCommand(oneCmd) -} - -var oneCmd = &cobra.Command{ - Use: "one [repo url] publishers.*.yml", - Short: "Crawl publiccode.yml from one single [repo url].", - Long: `Crawl publiccode.yml from a single repository defined with [repo url] - according to the supplied file(s). - No organizations! Only single repositories!`, - Args: cobra.MinimumNArgs(2), - Run: func(_ *cobra.Command, args []string) { - c := crawler.NewCrawler(dryRun) - - paths := args[1:] - url, err := url.Parse(args[0]) - if err != nil { - log.Error(err) - } - - err = c.CrawlRepo(*url, getPublisher(*url, paths)) - if err != nil { - log.Error(err) - } - }, -} - -func getPublisher(repoURL url.URL, paths []string) common.Publisher { - var publishers []common.Publisher - var p common.Publisher - - for _, path := range paths { - p, err := common.LoadPublishers(path) - if err != nil { - log.Fatal(err) - } - publishers = append(publishers, p...) 
- } - - for _, publisher := range publishers { - // looking into repositories - for _, repo := range publisher.Repositories { - log.Tracef("matching %s with %s", repo.String(), repoURL.String()) - if (url.URL)(repo) == repoURL { - log.Debugf("Publisher found %+v", publisher) - - return publisher - } - } - // looking into organizations - for _, repo := range publisher.Organizations { - log.Tracef("matching %s.* with %s", repo.String(), repoURL.String()) - if matched, _ := regexp.MatchString(repo.String()+".*", repoURL.String()); matched { - log.Debugf("Publisher found %+v", publisher) - - return publisher - } - } - } - - log.Warn("Publisher not found in publishers list, slug will be generated without Id") - - return p -} diff --git a/cmd/root.go b/cmd/root.go index 2955891..53536c3 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -8,7 +8,7 @@ import ( var ( dryRun bool rootCmd = &cobra.Command{ - Use: "crawler", + Use: "publiccode-crawler", Short: "A crawler for publiccode.yml files.", Long: `A fast and robust publiccode.yml file crawler. Complete documentation is available at https://github.com/italia/publiccode-crawler`, diff --git a/crawler/crawler.go b/crawler/crawler.go index 1b7abf6..bd6f53b 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -6,6 +6,7 @@ import ( "net/http" "net/url" "os" + "path" "regexp" "runtime" "strings" @@ -83,18 +84,38 @@ func NewCrawler(dryRun bool) *Crawler { return &c } -// CrawlRepo crawls a single repository (only used by the 'one' command). -func (c *Crawler) CrawlRepo(repoURL url.URL, publisher common.Publisher) error { - log.Infof("Processing repository: %s", repoURL.String()) +// CrawlSoftwareByAPIURL crawls a single software. 
+func (c *Crawler) CrawlSoftwareByID(software string, publisher common.Publisher) error { + var id string + + softwareURL, err := url.Parse(software) + if err != nil { + id = software + } else { + id = path.Base(softwareURL.Path) + } + + s, err := c.apiClient.GetSoftware(id) + if err != nil { + return err + } + + s.URL = strings.TrimSuffix(s.URL, ".git") + + repoURL, err := url.Parse(s.URL) + if err != nil { + return err + } + + log.Infof("Processing repository: %s", softwareURL.String()) - var err error switch { - case vcsurl.IsGitHub(&repoURL): - err = c.gitHubScanner.ScanRepo(repoURL, publisher, c.repositories) - case vcsurl.IsBitBucket(&repoURL): - err = c.bitBucketScanner.ScanRepo(repoURL, publisher, c.repositories) - case vcsurl.IsGitLab(&repoURL): - err = c.gitLabScanner.ScanRepo(repoURL, publisher, c.repositories) + case vcsurl.IsGitHub(repoURL): + err = c.gitHubScanner.ScanRepo(*repoURL, publisher, c.repositories) + case vcsurl.IsBitBucket(repoURL): + err = c.bitBucketScanner.ScanRepo(*repoURL, publisher, c.repositories) + case vcsurl.IsGitLab(repoURL): + err = c.gitLabScanner.ScanRepo(*repoURL, publisher, c.repositories) default: err = fmt.Errorf( "publisher %s: unsupported code hosting platform for %s",