From 6b672a4d1b7748fa2244b819ae853552b9028a7c Mon Sep 17 00:00:00 2001 From: Simone Basso Date: Wed, 14 Feb 2024 11:32:34 +0100 Subject: [PATCH 1/3] fix(gardener): avoid issues caused by stale databases We use a database to keep measurements results to allow to interrupt and resume the work, given that we have thousands or URLs to vet. However, after a week, the database is most likely stale, so let's just avoid trusting and start afresh in such a case. It's reasonable to assume that the database is stale after such an interval because the URL's status may have changed (e.g., some URLs could have stopped working). Additionally, every time we run `gardener sync` we most likely have new URLs, so we need to do one of the following: 1. merge new URLs into the database; 2. just zap the database and start afresh. Because doing 1. would be a bit time consuming, for now I am opting for doing 2. and we'll implement 1. if we really need it. --- .../gardener/internal/dnsreport/dnsreport.go | 24 ++++++++++++++----- internal/cmd/gardener/internal/sync/sync.go | 11 +++++++++ .../cmd/gardener/internal/sync/sync_test.go | 11 +++++---- internal/cmd/gardener/main.go | 9 +++---- internal/fsx/fsx.go | 7 +++--- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/internal/cmd/gardener/internal/dnsreport/dnsreport.go b/internal/cmd/gardener/internal/dnsreport/dnsreport.go index daa5fb7229..ae980b7e81 100644 --- a/internal/cmd/gardener/internal/dnsreport/dnsreport.go +++ b/internal/cmd/gardener/internal/dnsreport/dnsreport.go @@ -7,6 +7,7 @@ import ( "encoding/csv" "encoding/json" "fmt" + "io/fs" "net" "net/url" "os" @@ -51,20 +52,31 @@ type Subcommand struct { // loadedFromRepository counts the number of times we loaded from the repository var loadedFromRepository = &atomic.Int64{} +// databaseIsGood returns whether we should use the existing database content. You should +// pass to this function the results of os.Stat invoked on the database file path. +func databaseIsGood(statbuf fs.FileInfo, err error) bool { + const recreateInterval = 7 * 24 * time.Hour + return err == nil && fsx.IsRegular(statbuf) && time.Since(statbuf.ModTime()) < recreateInterval +} + // Main is the main function of the dnsreport subcommand. This function calls // [runtimex.PanicOnError] in case of failure. func (s *Subcommand) Main(ctx context.Context) { - // check whether the database exists - dbExists := fsx.RegularFileExists(s.Database) + // check whether the database exists and check its statistics + isGood := databaseIsGood(os.Stat(s.Database)) + + // if the database is not good, truncate it and restart + if !isGood { + log.Infof("rm -rf %s", s.Database) + _ = os.Remove(s.Database) + } // create or open the underlying sqlite3 database db := s.createOrOpenDatabase() defer db.Close() - // if the database has just been created, then import - // the URLs from the locally cloned git repository, otherwise - // keep using the existing database file - if !dbExists { + // fill again the database if what we had before was not good + if !isGood { log.Infof("creating new %s database", s.Database) s.loadFromRepository(db) loadedFromRepository.Add(1) diff --git a/internal/cmd/gardener/internal/sync/sync.go b/internal/cmd/gardener/internal/sync/sync.go index fdb81873e4..1f18cd00e6 100644 --- a/internal/cmd/gardener/internal/sync/sync.go +++ b/internal/cmd/gardener/internal/sync/sync.go @@ -16,6 +16,10 @@ const testListsRepo = "https://github.com/citizenlab/test-lists" // Subcommand is the sync subcommand. The zero value is invalid; please, make // sure you initialize all the fields marked as MANDATORY. type Subcommand struct { + // DNSReportDatabase is the MANDATORY file containing the `dnsreport` database + // that we're currently using to avoid repeating measurements. + DNSReportDatabase string + // RepositoryDir is the MANDATORY directory where to clone the test lists repository. RepositoryDir string @@ -35,6 +39,13 @@ func (s *Subcommand) Main() { // possibly remove a previous working copy runtimex.Try0(shellx.Run(log.Log, "rm", "-rf", s.RepositoryDir)) + // possibly remove an existing dnsreport.sqlite3 database + // + // TODO(bassosimone): an alternative would be to somehow take note of the fact + // that the database needs merging from an updated repository, but doing that + // would require us to write a more complex diff. + runtimex.Try0(shellx.Run(log.Log, "rm", "-rf", s.DNSReportDatabase)) + // clone a new working copy runtimex.Try0(shellx.Run(log.Log, "git", "clone", testListsRepo, s.RepositoryDir)) diff --git a/internal/cmd/gardener/internal/sync/sync_test.go b/internal/cmd/gardener/internal/sync/sync_test.go index d9fe79f881..605f2020c1 100644 --- a/internal/cmd/gardener/internal/sync/sync_test.go +++ b/internal/cmd/gardener/internal/sync/sync_test.go @@ -68,11 +68,13 @@ func TestWorkingAsIntended(t *testing.T) { // create the subcommand instance repodir := filepath.Join("testdata", "repo") + dnsreportfile := filepath.Join("testdata", "dnsreport.sqlite3") sc := &sync.Subcommand{ - RepositoryDir: repodir, - OsChdir: cc.Chdir, - OsGetwd: cc.Getwd, - TimeNow: cc.TimeNow, + DNSReportDatabase: dnsreportfile, + RepositoryDir: repodir, + OsChdir: cc.Chdir, + OsGetwd: cc.Getwd, + TimeNow: cc.TimeNow, } // run the subcommand with custom shellx dependencies @@ -83,6 +85,7 @@ func TestWorkingAsIntended(t *testing.T) { // expectations for commands expect := []string{ fmt.Sprintf("rm -rf %s", repodir), + fmt.Sprintf("rm -rf %s", dnsreportfile), fmt.Sprintf("git clone https://github.com/citizenlab/test-lists %s", repodir), fmt.Sprintf("cd %s", repodir), "git checkout -b gardener_20230315T114300Z", diff --git a/internal/cmd/gardener/main.go b/internal/cmd/gardener/main.go index c6a201e2ea..e9b5478c31 100644 --- a/internal/cmd/gardener/main.go +++ b/internal/cmd/gardener/main.go @@ -39,10 +39,11 @@ func main() { Args: cobra.NoArgs, Run: func(cmd *cobra.Command, args []string) { sc := &sync.Subcommand{ - RepositoryDir: repositoryDir, - OsChdir: os.Chdir, - OsGetwd: os.Getwd, - TimeNow: time.Now, + DNSReportDatabase: "dnsreport.sqlite3", + RepositoryDir: repositoryDir, + OsChdir: os.Chdir, + OsGetwd: os.Getwd, + TimeNow: time.Now, } sc.Main() }, diff --git a/internal/fsx/fsx.go b/internal/fsx/fsx.go index 48fe05bfc6..6ee374dfab 100644 --- a/internal/fsx/fsx.go +++ b/internal/fsx/fsx.go @@ -32,7 +32,7 @@ func openWithFS(fs fs.FS, pathname string) (fs.File, error) { file.Close() return nil, err } - if !isRegular(info) { + if !IsRegular(info) { file.Close() return nil, fmt.Errorf("%w: %s", ErrNotRegularFile, pathname) } @@ -47,7 +47,8 @@ func (filesystem) Open(pathname string) (fs.File, error) { return os.Open(pathname) } -func isRegular(info fs.FileInfo) bool { +// IsRegular returns whether a file is a regular file. +func IsRegular(info fs.FileInfo) bool { return info.Mode().IsRegular() } @@ -58,7 +59,7 @@ func RegularFileExists(filename string) bool { if err != nil { return false } - return isRegular(finfo) + return IsRegular(finfo) } // DirectoryExists returns whether the given filename From 31da77892fa91e58fe49bbf66bdefb973a1c726f Mon Sep 17 00:00:00 2001 From: Simone Basso Date: Wed, 14 Feb 2024 11:51:38 +0100 Subject: [PATCH 2/3] x --- internal/cmd/gardener/main.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/internal/cmd/gardener/main.go b/internal/cmd/gardener/main.go index e9b5478c31..8ba0072254 100644 --- a/internal/cmd/gardener/main.go +++ b/internal/cmd/gardener/main.go @@ -21,6 +21,9 @@ import ( // repositoryDir is the path of the citizenlab/test-lists working copy. const repositoryDir = "citizenlab-test-lists" +// dnsReportDatabase is the path of the database maintained by the dnsreport subcommand. +const dnsReportDatabase = "dnsreport.sqlite3" + func main() { // select a colourful apex/log handler log.SetHandler(cli.New(os.Stderr)) @@ -39,7 +42,7 @@ func main() { Args: cobra.NoArgs, Run: func(cmd *cobra.Command, args []string) { sc := &sync.Subcommand{ - DNSReportDatabase: "dnsreport.sqlite3", + DNSReportDatabase: dnsReportDatabase, RepositoryDir: repositoryDir, OsChdir: os.Chdir, OsGetwd: os.Getwd, @@ -59,7 +62,7 @@ func main() { sc := &dnsreport.Subcommand{ APIURL: "https://api.ooni.io", DNSOverHTTPSServerURL: "https://dns.google/dns-query", - Database: "dnsreport.sqlite3", + Database: dnsReportDatabase, ReportFile: "dnsreport.csv", RepositoryDir: repositoryDir, } From eef679b3ee82699d601255d0b4448f6a39d69b3e Mon Sep 17 00:00:00 2001 From: Simone Basso Date: Wed, 14 Feb 2024 11:58:14 +0100 Subject: [PATCH 3/3] fix: avoid using rm -rf for a regular file If dnsreport.sqlite3 isn't regular, then there's something deeply unexpected with the file system and we wouldn't want to zap something that may be precious to the user. --- internal/cmd/gardener/internal/dnsreport/dnsreport.go | 2 +- internal/cmd/gardener/internal/sync/sync.go | 2 +- internal/cmd/gardener/internal/sync/sync_test.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/cmd/gardener/internal/dnsreport/dnsreport.go b/internal/cmd/gardener/internal/dnsreport/dnsreport.go index ae980b7e81..d5edc649c8 100644 --- a/internal/cmd/gardener/internal/dnsreport/dnsreport.go +++ b/internal/cmd/gardener/internal/dnsreport/dnsreport.go @@ -67,7 +67,7 @@ func (s *Subcommand) Main(ctx context.Context) { // if the database is not good, truncate it and restart if !isGood { - log.Infof("rm -rf %s", s.Database) + log.Infof("rm -f %s", s.Database) _ = os.Remove(s.Database) } diff --git a/internal/cmd/gardener/internal/sync/sync.go b/internal/cmd/gardener/internal/sync/sync.go index 1f18cd00e6..c5f1360f1f 100644 --- a/internal/cmd/gardener/internal/sync/sync.go +++ b/internal/cmd/gardener/internal/sync/sync.go @@ -44,7 +44,7 @@ func (s *Subcommand) Main() { // TODO(bassosimone): an alternative would be to somehow take note of the fact // that the database needs merging from an updated repository, but doing that // would require us to write a more complex diff. - runtimex.Try0(shellx.Run(log.Log, "rm", "-rf", s.DNSReportDatabase)) + runtimex.Try0(shellx.Run(log.Log, "rm", "-f", s.DNSReportDatabase)) // clone a new working copy runtimex.Try0(shellx.Run(log.Log, "git", "clone", testListsRepo, s.RepositoryDir)) diff --git a/internal/cmd/gardener/internal/sync/sync_test.go b/internal/cmd/gardener/internal/sync/sync_test.go index 605f2020c1..dbbba8df41 100644 --- a/internal/cmd/gardener/internal/sync/sync_test.go +++ b/internal/cmd/gardener/internal/sync/sync_test.go @@ -85,7 +85,7 @@ func TestWorkingAsIntended(t *testing.T) { // expectations for commands expect := []string{ fmt.Sprintf("rm -rf %s", repodir), - fmt.Sprintf("rm -rf %s", dnsreportfile), + fmt.Sprintf("rm -f %s", dnsreportfile), fmt.Sprintf("git clone https://github.com/citizenlab/test-lists %s", repodir), fmt.Sprintf("cd %s", repodir), "git checkout -b gardener_20230315T114300Z",