Skip to content

Commit

Permalink
feat!: add crawl-software command and remove one
Browse files Browse the repository at this point in the history
Add a new `crawl-software` command that replaces the non-functional
`one` command.

Fix #122.
  • Loading branch information
bfabio committed Jun 10, 2024
1 parent 4feb1ae commit 6de3bda
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 95 deletions.
17 changes: 12 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@
Developers Italia provides [a catalog of Free and Open Source](https://developers.italia.it/en/search)
software aimed to Public Administrations.

This **crawler** retrieves the `publiccode.yml` files from the
`publiccode-crawler` retrieves the `publiccode.yml` files from the
repositories of publishers found in the [Developers Italia API](https://github.com/italia/developers-italia-api).

## Setup and deployment processes

The crawler can either run manually on the target machine or it can be deployed
`publiccode-crawler` can either run manually on the target machine or it can be deployed
from a Docker container.

### Manually configure and build the crawler
### Manually configure and build

1. Rename `config.toml.example` to `config.toml` and set the variables

Expand All @@ -43,16 +43,23 @@ docker run -it italia/publiccode-crawler

## Commands

### `crawler crawl`
### `publiccode-crawler crawl`

Gets the list of publishers from `https://api.developers.italia.it/v1/publishers`
and starts to crawl their repositories.

### `crawler crawl publishers*.yml`
### `publiccode-crawler crawl publishers*.yml`

Gets the list of publishers in `publishers*.yml` and starts to crawl
their repositories.

### `publiccode-crawler crawl-software <software> <publisher>`

Crawl just the software specified as parameter.
It takes the software URL and its publisher id as parameters.

Ex. `publiccode-crawler crawl-software https://api.developers.italia.it/v1/software/a2ea59b0-87cd-4419-b93f-00bed8a7b859 edb66b3d-3e36-4b69-aba9-b7c4661b3fdd`

### Other commands

* `publiccode-crawler download-publishers` downloads organizations and repositories from
Expand Down
19 changes: 19 additions & 0 deletions apiclient/apiclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,25 @@ page:
return publishers, nil
}

// GetSoftware returns the software with the given id or any error encountered.
//
// The request is a GET on <baseURL>/software/<id>. A response whose status is
// outside the 2xx range is reported as an error rather than decoded, so that
// an error payload (e.g. for an unknown id) is never returned to the caller
// as a zero-valued Software.
func (clt APIClient) GetSoftware(id string) (*Software, error) {
	res, err := clt.retryableClient.Get(joinPath(clt.baseURL, "/software") + "/" + id)
	if err != nil {
		return nil, fmt.Errorf("can't GET /software/%s: %w", id, err)
	}

	defer res.Body.Close()

	// Only a successful response carries a Software document; anything else
	// would decode "successfully" into an empty struct and hide the failure.
	if res.StatusCode < 200 || res.StatusCode > 299 {
		return nil, fmt.Errorf("can't GET /software/%s: unexpected status %s", id, res.Status)
	}

	var softwareResponse Software

	if err := json.NewDecoder(res.Body).Decode(&softwareResponse); err != nil {
		return nil, fmt.Errorf("can't parse GET /software/%s response: %w", id, err)
	}

	return &softwareResponse, nil
}

// GetSoftwareByURL returns the software matching the given repo URL and
// any error encountered.
// In case no software is found and no error occurs, (nil, nil) is returned.
Expand Down
43 changes: 43 additions & 0 deletions cmd/crawl-software.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package cmd

import (
"github.com/italia/publiccode-crawler/v4/common"
"github.com/italia/publiccode-crawler/v4/crawler"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)

// init declares the flags of the crawl-software command and attaches the
// command to the root command.
func init() {
	flags := crawlSoftwareCmd.Flags()
	flags.BoolVarP(&dryRun, "dry-run", "n", false, "perform a dry run with no changes made")

	rootCmd.AddCommand(crawlSoftwareCmd)
}

// crawlSoftwareCmd is the `crawl-software` command.
//
// It takes exactly two positional arguments: the software to crawl (args[0],
// passed as-is to Crawler.CrawlSoftwareByID) and the id of its publisher
// (args[1]). The process exits via log.Fatal when GITHUB_TOKEN is not set or
// when the crawl returns an error.
var crawlSoftwareCmd = &cobra.Command{
	// The first argument is required and is either an id or a URL, hence the
	// braces: square brackets would mark it as optional in the usage line.
	Use:   "crawl-software {SOFTWARE_ID | SOFTWARE_URL} PUBLISHER_ID",
	Short: "Crawl a single software by its id.",
	Long: `Crawl a single software by its id.
Crawl a single software given its API id and its publisher.`,
	Example: "# Crawl just the specified software\n" +
		"publiccode-crawler crawl-software" +
		" https://api.developers.italia.it/v1/software/af6056fc-b2b2-4d31-9961-c9bd94e32bd4 PCM",

	Args: cobra.ExactArgs(2),
	Run: func(_ *cobra.Command, args []string) {
		if viper.GetString("GITHUB_TOKEN") == "" {
			log.Fatal("Please set GITHUB_TOKEN, it's needed to use the GitHub API")
		}

		c := crawler.NewCrawler(dryRun)

		// Only the publisher id is known here; the other fields stay at
		// their zero value.
		publisher := common.Publisher{
			ID: args[1],
		}

		if err := c.CrawlSoftwareByID(args[0], publisher); err != nil {
			log.Fatal(err)
		}
	},
}
11 changes: 10 additions & 1 deletion cmd/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,21 @@ func init() {
}

var crawlCmd = &cobra.Command{
Use: "crawl publishers.yml [directory/*.yml ...]",
Use: "crawl [publishers.yml] [directory/*.yml ...]",
Short: "Crawl publiccode.yml files in publishers' repos.",
Long: `Crawl publiccode.yml files in publishers' repos.
When run with no arguments, the publishers are fetched from the API,
otherwise the passed YAML files are used.`,
Example: `
# Crawl publishers fetched from the API
crawl
# Crawl using a specific publishers.yml file
crawl publishers.yml
# Crawl all YAML files in a specific directory
crawl directory/*.yml`,

Args: cobra.MinimumNArgs(0),
Run: func(_ *cobra.Command, args []string) {
Expand Down
78 changes: 0 additions & 78 deletions cmd/one.go

This file was deleted.

2 changes: 1 addition & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
var (
dryRun bool
rootCmd = &cobra.Command{
Use: "crawler",
Use: "publiccode-crawler",
Short: "A crawler for publiccode.yml files.",
Long: `A fast and robust publiccode.yml file crawler.
Complete documentation is available at https://github.com/italia/publiccode-crawler`,
Expand Down
41 changes: 31 additions & 10 deletions crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"net/http"
"net/url"
"os"
"path"
"regexp"
"runtime"
"strings"
Expand Down Expand Up @@ -83,18 +84,38 @@ func NewCrawler(dryRun bool) *Crawler {
return &c
}

// CrawlRepo crawls a single repository (only used by the 'one' command).
func (c *Crawler) CrawlRepo(repoURL url.URL, publisher common.Publisher) error {
log.Infof("Processing repository: %s", repoURL.String())
// CrawlSoftwareByID crawls a single software, given its API id or its API URL.
func (c *Crawler) CrawlSoftwareByID(software string, publisher common.Publisher) error {
var id string

softwareURL, err := url.Parse(software)
if err != nil {
id = software
} else {
id = path.Base(softwareURL.Path)
}

s, err := c.apiClient.GetSoftware(id)
if err != nil {
return err
}

s.URL = strings.TrimSuffix(s.URL, ".git")

repoURL, err := url.Parse(s.URL)
if err != nil {
return err
}

log.Infof("Processing repository: %s", softwareURL.String())

var err error
switch {
case vcsurl.IsGitHub(&repoURL):
err = c.gitHubScanner.ScanRepo(repoURL, publisher, c.repositories)
case vcsurl.IsBitBucket(&repoURL):
err = c.bitBucketScanner.ScanRepo(repoURL, publisher, c.repositories)
case vcsurl.IsGitLab(&repoURL):
err = c.gitLabScanner.ScanRepo(repoURL, publisher, c.repositories)
case vcsurl.IsGitHub(repoURL):
err = c.gitHubScanner.ScanRepo(*repoURL, publisher, c.repositories)
case vcsurl.IsBitBucket(repoURL):
err = c.bitBucketScanner.ScanRepo(*repoURL, publisher, c.repositories)
case vcsurl.IsGitLab(repoURL):
err = c.gitLabScanner.ScanRepo(*repoURL, publisher, c.repositories)
default:
err = fmt.Errorf(
"publisher %s: unsupported code hosting platform for %s",
Expand Down

0 comments on commit 6de3bda

Please sign in to comment.