diff --git a/internal/enginelocate/iplookup.go b/internal/enginelocate/iplookup.go index 42c6411f74..3693b0a112 100644 --- a/internal/enginelocate/iplookup.go +++ b/internal/enginelocate/iplookup.go @@ -77,13 +77,21 @@ func makeSlice() []method { return ret } +func contextForIPLookupWithTimeout(ctx context.Context) (context.Context, context.CancelFunc) { + // TODO(https://github.com/ooni/probe/issues/2551): we must enforce a timeout this + // large to ensure we give all resolvers a chance to run. We set this value as part of + // an hotfix. The above mentioned issue explains how to improve the situation and + // avoid the need of setting such large timeouts here. + const timeout = 45 * time.Second + return context.WithTimeout(ctx, timeout) +} + func (c ipLookupClient) doWithCustomFunc( ctx context.Context, fn lookupFunc, ) (string, error) { - // Reliability fix: let these mechanisms timeout earlier. - const timeout = 7 * time.Second - ctx, cancel := context.WithTimeout(ctx, timeout) + ctx, cancel := contextForIPLookupWithTimeout(ctx) defer cancel() + // Implementation note: we MUST use an HTTP client that we're // sure IS NOT using any proxy. To this end, we construct a // client ourself that we know is not proxied. diff --git a/internal/enginelocate/iplookup_test.go b/internal/enginelocate/iplookup_test.go index e3c11d6935..f8c171c6ce 100644 --- a/internal/enginelocate/iplookup_test.go +++ b/internal/enginelocate/iplookup_test.go @@ -5,6 +5,7 @@ import ( "errors" "net" "testing" + "time" "github.com/apex/log" "github.com/ooni/probe-cli/v3/internal/model" @@ -55,3 +56,19 @@ func TestIPLookupInvalidIP(t *testing.T) { t.Fatal("expected the default IP here") } } + +func TestContextForIPLookupWithTimeout(t *testing.T) { + now := time.Now() + ctx, cancel := contextForIPLookupWithTimeout(context.Background()) + defer cancel() + deadline, okay := ctx.Deadline() + if !okay { + t.Fatal("the context does not have a deadline") + } + delta := deadline.Sub(now) + // Note: super conservative check. Assume it may take up to five seconds + // for the code to create a context, which is totally unrealistic. + if delta < 40*time.Second { + t.Fatal("the deadline is too short") + } +} diff --git a/internal/enginelocate/stun.go b/internal/enginelocate/stun.go index b0600b65e5..848e29a83d 100644 --- a/internal/enginelocate/stun.go +++ b/internal/enginelocate/stun.go @@ -29,6 +29,7 @@ func stunNewClient(conn net.Conn) (stunClient, error) { func stunIPLookup(ctx context.Context, config stunConfig) (string, error) { config.Logger.Debugf("STUNIPLookup: start using %s", config.Endpoint) + ip, err := func() (string, error) { dialer := config.Dialer if dialer == nil { @@ -74,6 +75,7 @@ func stunIPLookup(ctx context.Context, config stunConfig) (string, error) { return model.DefaultProbeIP, ctx.Err() } }() + if err != nil { config.Logger.Debugf("STUNIPLookup: failure using %s: %+v", config.Endpoint, err) return model.DefaultProbeIP, err diff --git a/internal/engineresolver/doc.go b/internal/engineresolver/doc.go index a1681fc640..c0cdb2d990 100644 --- a/internal/engineresolver/doc.go +++ b/internal/engineresolver/doc.go @@ -13,8 +13,8 @@ // have any preferential ordering. The initial resolutions may be slower // if there are many issues with resolvers. // -// The system resolver is given the lowest priority at the beginning -// but it will of course be the most popular resolver if anything else +// The system resolver is given intermediate priority at the beginning (i.e., +// 0.5) but it will of course be the most popular resolver if anything else // is failing us. (We will still occasionally probe for other working // resolvers and increase their score on success.) // diff --git a/internal/engineresolver/resolver.go b/internal/engineresolver/resolver.go index 4647061d70..51ddf5f2a7 100644 --- a/internal/engineresolver/resolver.go +++ b/internal/engineresolver/resolver.go @@ -109,6 +109,16 @@ func (r *Resolver) LookupHost(ctx context.Context, hostname string) ([]string, e r.logger().Infof("sessionresolver: skipping with proxy: %+v", e) continue // we cannot proxy this URL so ignore it } + + // Hotfix: if the context has been canceled from the outside avoid + // doing a dnslookup, which would mark the resolver as not WAI. + // + // See https://github.com/ooni/probe/issues/2544 + if err := ctx.Err(); err != nil { + me.Add(newErrWrapper(err, e.URL)) + continue + } + addrs, err := r.lookupHost(ctx, e, hostname) if err == nil { return addrs, nil diff --git a/internal/engineresolver/resolver_test.go b/internal/engineresolver/resolver_test.go index 5565dfc138..6db180ab0f 100644 --- a/internal/engineresolver/resolver_test.go +++ b/internal/engineresolver/resolver_test.go @@ -34,7 +34,7 @@ func TestAddressWorks(t *testing.T) { } } -func TestTypicalUsageWithFailure(t *testing.T) { +func TestResolverLookupHostUsingACanceledContext(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) cancel() // fail immediately reso := &Resolver{KVStore: &kvstore.Memory{}} @@ -47,6 +47,7 @@ func TestTypicalUsageWithFailure(t *testing.T) { t.Fatal("cannot convert error") } for _, child := range me.Children { + // net.DNSError does not include the underlying error // but just a string representing the error. This // means that we need to go down hunting what's the @@ -54,7 +55,7 @@ func TestTypicalUsageWithFailure(t *testing.T) { { var ew *errWrapper if !errors.As(child, &ew) { - t.Fatal("not an instance of errwrapper") + t.Fatalf("not an instance of errwrapper: '%v' [%T]", child, child) } var de *net.DNSError if errors.As(ew, &de) { @@ -64,6 +65,7 @@ func TestTypicalUsageWithFailure(t *testing.T) { continue } } + // otherwise just unwrap and check whether it's // a real context.Canceled error. if !errors.Is(child, context.Canceled) { @@ -73,9 +75,22 @@ func TestTypicalUsageWithFailure(t *testing.T) { if addrs != nil { t.Fatal("expected nil here") } - if len(reso.res) < 1 { - t.Fatal("expected to see some resolvers here") + + // Since https://github.com/ooni/probe-cli/pull/1351 we avoid + // constructing any resolver if we start running with a canceled context + // because of https://github.com/ooni/probe/issues/2544. + // + // In other words, as long as we see zero here we can be confident + // that we're not creating a resolver if the context has been canceled, + // which should imply we're not changing its score. + // + // Heavier refactoring of this package should probably more aggressively + // ensure that we're not changing the score, but for now this test + // is sufficient given that we are committing an hotfix. + if len(reso.res) != 0 { + t.Fatal("expected to see no resolvers here") } + reso.CloseIdleConnections() if len(reso.res) != 0 { t.Fatal("expected to see no resolvers after CloseIdleConnections") diff --git a/internal/engineresolver/resolvermaker.go b/internal/engineresolver/resolvermaker.go index 6f5fe363bd..5f42acf511 100644 --- a/internal/engineresolver/resolvermaker.go +++ b/internal/engineresolver/resolvermaker.go @@ -43,21 +43,29 @@ var allmakers = []*resolvermaker{{ }} // allbyurl contains all the resolvermakers by URL -var allbyurl map[string]*resolvermaker +var allbyurl = resolverMakeInitialState() -// init fills allbyname and gives a nonzero initial score +// resolverMakeInitialState initializes the initial +// state by giving a nonzero initial score // to all resolvers except for the system resolver. We set -// the system resolver score to zero, so that it's less +// the system resolver score to be 0.5, so that it's less // likely than other resolvers in this list. -func init() { - allbyurl = make(map[string]*resolvermaker) +// +// We used to set this value to 0, but this proved to +// create issues when it was the only available resolver, +// see https://github.com/ooni/probe/issues/2544. +func resolverMakeInitialState() map[string]*resolvermaker { + output := make(map[string]*resolvermaker) rng := rand.New(rand.NewSource(time.Now().UnixNano())) for _, e := range allmakers { - allbyurl[e.url] = e + output[e.url] = e if e.url != systemResolverURL { e.score = rng.Float64() + } else { + e.score = 0.5 } } + return output } // logger returns the configured logger or a default diff --git a/internal/engineresolver/resolvermaker_test.go b/internal/engineresolver/resolvermaker_test.go index 9889c0f977..d2de4e8a5c 100644 --- a/internal/engineresolver/resolvermaker_test.go +++ b/internal/engineresolver/resolvermaker_test.go @@ -121,3 +121,19 @@ func TestGetResolverHTTP3(t *testing.T) { t.Fatal("expected true") } } + +func TestResolverMakeInitialState(t *testing.T) { + t.Run("the system resolver has a default score of 0.5", func(t *testing.T) { + state := resolverMakeInitialState() + var okay bool + for URL, entry := range state { + t.Logf("entry: %v %v", URL, *entry) + if URL == systemResolverURL && entry.score == 0.5 { + okay = true + } + } + if !okay { + t.Fatal("expected to see the system resolver with 0.5 as its score") + } + }) +}