Add faster availability DNS probe and update test domain to .com (#2280)

* Add faster availability DNS probe and update test domain to .com

- Count successful queries and check that count before running the probes that follow a network map update.

- Reduce the timeout of the first DNS probe to 500ms

- Update the test domain to `com.` instead of `.` due to issues with the Palo Alto DNS proxy server

* use fqdn

* Update client/internal/dns/upstream.go

Co-authored-by: Viktor Liu <17948409+lixmal@users.noreply.github.com>

---------

Co-authored-by: Viktor Liu <17948409+lixmal@users.noreply.github.com>
This commit is contained in:
Maycon Santos 2024-07-17 23:48:37 +02:00 committed by GitHub
parent e78ec2e985
commit 19147f518e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -24,7 +24,7 @@ const (
probeTimeout = 2 * time.Second probeTimeout = 2 * time.Second
) )
const testRecord = "." const testRecord = "com."
type upstreamClient interface { type upstreamClient interface {
exchange(ctx context.Context, upstream string, r *dns.Msg) (*dns.Msg, time.Duration, error) exchange(ctx context.Context, upstream string, r *dns.Msg) (*dns.Msg, time.Duration, error)
@ -42,6 +42,7 @@ type upstreamResolverBase struct {
upstreamServers []string upstreamServers []string
disabled bool disabled bool
failsCount atomic.Int32 failsCount atomic.Int32
successCount atomic.Int32
failsTillDeact int32 failsTillDeact int32
mutex sync.Mutex mutex sync.Mutex
reactivatePeriod time.Duration reactivatePeriod time.Duration
@ -124,6 +125,7 @@ func (u *upstreamResolverBase) ServeDNS(w dns.ResponseWriter, r *dns.Msg) {
return return
} }
u.successCount.Add(1)
log.Tracef("took %s to query the upstream %s", t, upstream) log.Tracef("took %s to query the upstream %s", t, upstream)
err = w.WriteMsg(rm) err = w.WriteMsg(rm)
@ -172,6 +174,11 @@ func (u *upstreamResolverBase) probeAvailability() {
default: default:
} }
// avoid probe if upstreams could resolve at least one query and fails count is less than failsTillDeact
if u.successCount.Load() > 0 && u.failsCount.Load() < u.failsTillDeact {
return
}
var success bool var success bool
var mu sync.Mutex var mu sync.Mutex
var wg sync.WaitGroup var wg sync.WaitGroup
@ -183,7 +190,7 @@ func (u *upstreamResolverBase) probeAvailability() {
wg.Add(1) wg.Add(1)
go func() { go func() {
defer wg.Done() defer wg.Done()
err := u.testNameserver(upstream) err := u.testNameserver(upstream, 500*time.Millisecond)
if err != nil { if err != nil {
errors = multierror.Append(errors, err) errors = multierror.Append(errors, err)
log.Warnf("probing upstream nameserver %s: %s", upstream, err) log.Warnf("probing upstream nameserver %s: %s", upstream, err)
@ -224,7 +231,7 @@ func (u *upstreamResolverBase) waitUntilResponse() {
} }
for _, upstream := range u.upstreamServers { for _, upstream := range u.upstreamServers {
if err := u.testNameserver(upstream); err != nil { if err := u.testNameserver(upstream, probeTimeout); err != nil {
log.Tracef("upstream check for %s: %s", upstream, err) log.Tracef("upstream check for %s: %s", upstream, err)
} else { } else {
// at least one upstream server is available, stop probing // at least one upstream server is available, stop probing
@ -244,6 +251,7 @@ func (u *upstreamResolverBase) waitUntilResponse() {
log.Infof("upstreams %s are responsive again. Adding them back to system", u.upstreamServers) log.Infof("upstreams %s are responsive again. Adding them back to system", u.upstreamServers)
u.failsCount.Store(0) u.failsCount.Store(0)
u.successCount.Add(1)
u.reactivate() u.reactivate()
u.disabled = false u.disabled = false
} }
@ -265,13 +273,14 @@ func (u *upstreamResolverBase) disable(err error) {
} }
log.Warnf("Upstream resolving is Disabled for %v", reactivatePeriod) log.Warnf("Upstream resolving is Disabled for %v", reactivatePeriod)
u.successCount.Store(0)
u.deactivate(err) u.deactivate(err)
u.disabled = true u.disabled = true
go u.waitUntilResponse() go u.waitUntilResponse()
} }
func (u *upstreamResolverBase) testNameserver(server string) error { func (u *upstreamResolverBase) testNameserver(server string, timeout time.Duration) error {
ctx, cancel := context.WithTimeout(u.ctx, probeTimeout) ctx, cancel := context.WithTimeout(u.ctx, timeout)
defer cancel() defer cancel()
r := new(dns.Msg).SetQuestion(testRecord, dns.TypeSOA) r := new(dns.Msg).SetQuestion(testRecord, dns.TypeSOA)