From c942f6b2fca522962f6300bc2bc6388b872046c4 Mon Sep 17 00:00:00 2001 From: Jay Taylor Date: Thu, 16 Jan 2025 16:46:20 -0800 Subject: [PATCH] API: Sandbox routing: Use Redis for client-proxy DNS instead of local map. --- .terraform.lock.hcl | 14 ++- packages/api/internal/dns/server.go | 87 ++++++++++++++----- packages/api/internal/orchestrator/cache.go | 8 +- .../api/internal/orchestrator/orchestrator.go | 47 ++++++---- packages/nomad/main.tf | 10 +++ packages/nomad/redis.hcl | 64 ++++++++++++++ 6 files changed, 185 insertions(+), 45 deletions(-) create mode 100644 packages/nomad/redis.hcl diff --git a/.terraform.lock.hcl b/.terraform.lock.hcl index 1dff6d2cc..01e9e742c 100644 --- a/.terraform.lock.hcl +++ b/.terraform.lock.hcl @@ -3,9 +3,10 @@ provider "registry.terraform.io/cloudflare/cloudflare" { version = "4.19.0" - constraints = "~> 4.0" + constraints = "4.19.0" hashes = [ "h1:3EZM8zAObdA81PcyXyiic4y2aZsTowYKG29RjZKXbJU=", + "h1:JLYiArMIeU5gf/2jEbyijUBxuR1BeJCZRjUwe2bkr44=", "zh:1d5315dcbd8187a3a978dc1fb08e80b6cdd353de10afe531b3d1ecb834d0dbae", "zh:2a6e5b2e5072e442b35ce6142172e15afb835e16799d04a0054a79d3067f7560", "zh:308c5690024a1f6797300018456a1ac781c8699fa4bc4892a8c36eb992604a26", @@ -27,6 +28,7 @@ provider "registry.terraform.io/cloudflare/cloudflare" { provider "registry.terraform.io/hashicorp/external" { version = "2.3.4" hashes = [ + "h1:U6W8rgrdmR2pZ2cicFoGOSQ4GXuIf/4EK7s0vTJN7is=", "h1:cCabxnWQ5fX1lS7ZqgUzsvWmKZw9FA7NRxAZ94vcTcc=", "zh:037fd82cd86227359bc010672cd174235e2d337601d4686f526d0f53c87447cb", "zh:0ea1db63d6173d01f2fa8eb8989f0809a55135a0d8d424b08ba5dabad73095fa", @@ -47,6 +49,7 @@ provider "registry.terraform.io/hashicorp/google" { version = "5.31.0" constraints = ">= 4.50.0, 5.31.0, < 6.0.0" hashes = [ + "h1:JZkdP8qA+yE7xfZr8c26ahfok1nO5GATIgows3ZYiM0=", "h1:q3xMw4s8/0yP7EiIzcULyANOMF/b/f/gRpACOQi+wCE=", "zh:19f68d33a25c1d79dccb90ddf990e971943eef1de0272858a94e88571bd5792b", "zh:2d65f3c99171c4ff4a7b28d2b2752f1da3303c94ec04279c655dd11743c86da7", @@ -68,6 +71,7 @@ provider "registry.terraform.io/hashicorp/google-beta" { constraints = ">= 4.50.0, 5.31.0, < 6.0.0" hashes = [ "h1:sIXAK1wOIAa7JI4Rrud730rDC3C1+WJl73HOZym7uag=", + "h1:tJ0ZokfEBllpoRM6EuhkKNl77SaphlT3ttr4bIwoL6o=", "zh:11b968dcc9ff336bc39809acb551aca6b385a987953225f1c29170bc85e81a6c", "zh:2f62f0ef17257002d49e1960ec96ffee9ebd6463464d06708a7fa88812e1cf7b", "zh:3a9e41ee9bb4ef152b2a33b6594a425b707b4162ba895a676521cc9a889e5fe4", @@ -87,6 +91,7 @@ provider "registry.terraform.io/hashicorp/nomad" { version = "2.1.0" constraints = "2.1.0" hashes = [ + "h1:5ivj1OVGUWQb6Pv5U8roWWPuANBlmFR/3jnmFiLZ7dc=", "h1:xW88kA3/KMseqV8ISKZW1pQIsnTjZOYb/FtxfdTAWNI=", "zh:39ba4d4fc9557d4d2c1e4bf866cf63973359b73e908cce237c54384512bdb454", "zh:40d2b66e3f3675e6b88000c145977c1d5288510c76b702c6c131d9168546c605", @@ -104,9 +109,11 @@ provider "registry.terraform.io/hashicorp/nomad" { } provider "registry.terraform.io/hashicorp/random" { - version = "3.5.1" + version = "3.5.1" + constraints = ">= 2.1.0, 3.5.1" hashes = [ "h1:IL9mSatmwov+e0+++YX2V6uel+dV6bn+fC/cnGDK3Ck=", + "h1:sZ7MTSD4FLekNN2wSNFGpM+5slfvpm5A/NLVZiB7CO0=", "zh:04e3fbd610cb52c1017d282531364b9c53ef72b6bc533acb2a90671957324a64", "zh:119197103301ebaf7efb91df8f0b6e0dd31e6ff943d231af35ee1831c599188d", "zh:4d2b219d09abf3b1bb4df93d399ed156cadd61f44ad3baf5cf2954df2fba0831", @@ -126,6 +133,7 @@ provider "registry.terraform.io/hashicorp/time" { version = "0.12.1" hashes = [ "h1:JzYsPugN8Fb7C4NlfLoFu7BBPuRVT2/fCOdCaxshveI=", + "h1:j+ED7j0ZFJ4EDx7sdna76wsiIf397toylDN0dFi6v0U=", "zh:090023137df8effe8804e81c65f636dadf8f9d35b79c3afff282d39367ba44b2", "zh:26f1e458358ba55f6558613f1427dcfa6ae2be5119b722d0b3adb27cd001efea", "zh:272ccc73a03384b72b964918c7afeb22c2e6be22460d92b150aaf28f29a7d511", @@ -145,6 +153,7 @@ provider "registry.terraform.io/integrations/github" { version = "5.42.0" constraints = "5.42.0" hashes = [ + "h1:CZUAXhUhMIuIyTPm9VDcvOZgM1Lsl9tuKm5wW9tBEsM=", "h1:rfyLEgbZCk3MMCBuGd4PNFM914vtLqGIYcsmVKr6tdg=", "zh:0f97039c6b70295c4a82347bc8a0bcea700b3fb3df0e0be53585da025584bb7c", "zh:12e78898580cc2a72b5f2a77e191b158f88e974b0500489b691f34842288745c", @@ -168,6 +177,7 @@ provider "registry.terraform.io/kreuzwerker/docker" { constraints = "3.0.2" hashes = [ "h1:XjdpVL61KtTsuPE8swok3GY8A+Bu3TZs8T2DOEpyiXo=", + "h1:os8pBi4rbtFJJtzNWlcGhOVsz5V9UPJvo+L0wNQFYE8=", "zh:15b0a2b2b563d8d40f62f83057d91acb02cd0096f207488d8b4298a59203d64f", "zh:23d919de139f7cd5ebfd2ff1b94e6d9913f0977fcfc2ca02e1573be53e269f95", "zh:38081b3fe317c7e9555b2aaad325ad3fa516a886d2dfa8605ae6a809c1072138", diff --git a/packages/api/internal/dns/server.go b/packages/api/internal/dns/server.go index 98f2b6306..3f3f2f140 100644 --- a/packages/api/internal/dns/server.go +++ b/packages/api/internal/dns/server.go @@ -1,48 +1,81 @@ package dns import ( + "context" "fmt" - "log" "net" "strings" - "sync" + redis "github.com/go-redis/redis/v8" resolver "github.com/miekg/dns" - - "github.com/e2b-dev/infra/packages/shared/pkg/smap" + "go.uber.org/zap" ) const ttl = 0 const defaultRoutingIP = "127.0.0.1" +type FallbackResolverFn = func(sandboxID string) (string, bool) + type DNS struct { - mu sync.Mutex - records *smap.Map[string] + ctx context.Context + rdb *redis.Client + fallbackResolverFn FallbackResolverFn + logger *zap.SugaredLogger } -func New() *DNS { +func New(ctx context.Context, rdbOpts *redis.Options, fallbackResolverFn FallbackResolverFn, logger *zap.SugaredLogger) *DNS { return &DNS{ - records: smap.New[string](), + ctx: ctx, + rdb: redis.NewClient(rdbOpts), + fallbackResolverFn: fallbackResolverFn, + logger: logger, } } -func (d *DNS) Add(sandboxID, ip string) { - d.records.Insert(d.hostname(sandboxID), ip) +func (d *DNS) Add(sandboxID, ip string) error { + d.logger.Infof("DNS: Adding entry, sandboxID=%s -> %s", sandboxID, ip) + if err := d.rdb.Set(d.ctx, d.dnsKeyFor(sandboxID), ip, 86400).Err(); err != nil { + return err + } + return nil } -func (d *DNS) Remove(sandboxID, ip string) { - d.records.RemoveCb(d.hostname(sandboxID), func(key string, v string, exists bool) bool { - return v == ip - }) +func (d *DNS) Remove(sandboxID string) error { + d.logger.Infof("DNS: Removing entry, sandboxID=%s", sandboxID) + if err := d.rdb.Del(d.ctx, d.dnsKeyFor(sandboxID)).Err(); err != nil { + return err + } + return nil } -func (d *DNS) get(hostname string) (string, bool) { - return d.records.Get(hostname) +func (d *DNS) get(sandboxID string) (string, bool) { + res, err := d.rdb.Get(d.ctx, d.dnsKeyFor(sandboxID)).Result() + if err == nil { + return res, true + } + if err != redis.Nil { + d.logger.Warnf("DNS: Redis error getting key for sandbox '%s' (will try fallback resolver..): %s", sandboxID, err) + } + + if d.fallbackResolverFn != nil { + if rec, ok := d.fallbackResolverFn(sandboxID); ok { + d.logger.Infof("DNS: Not found in redis, using fallback lookup for sandbox '%s' succeeded: record=%q", sandboxID, rec) + go func() { + if err := d.Add(sandboxID, rec); err != nil { + d.logger.Errorf("DNS: Problem adding entry: %s", err) + } + }() + return rec, true + } else { + d.logger.Errorf("DNS: Fallback lookup for sandbox '%s' failed", sandboxID) + } + } + return "", false } -func (*DNS) hostname(sandboxID string) string { - return fmt.Sprintf("%s.", sandboxID) +func (d *DNS) dnsKeyFor(sandboxID string) string { + return fmt.Sprintf("dns.%s", sandboxID) } func (d *DNS) handleDNSRequest(w resolver.ResponseWriter, r *resolver.Msg) { @@ -63,8 +96,12 @@ func (d *DNS) handleDNSRequest(w resolver.ResponseWriter, r *resolver.Msg) { } sandboxID := strings.Split(q.Name, "-")[0] - ip, found := d.get(sandboxID) - if found { + // Trim trailing period to facilitate key consistency. + if strings.HasSuffix(sandboxID, ".") { + sandboxID = sandboxID[0 : len(sandboxID)-1] + } + + if ip, found := d.get(sandboxID); found { a.A = net.ParseIP(ip).To4() } else { a.A = net.ParseIP(defaultRoutingIP).To4() @@ -76,7 +113,7 @@ func (d *DNS) handleDNSRequest(w resolver.ResponseWriter, r *resolver.Msg) { err := w.WriteMsg(m) if err != nil { - log.Printf("Failed to write message: %s\n", err.Error()) + d.logger.Errorf("DNS: Failed to write message: %w", err) } } @@ -85,11 +122,15 @@ func (d *DNS) Start(address string, port int) error { mux.HandleFunc(".", d.handleDNSRequest) - server := resolver.Server{Addr: fmt.Sprintf("%s:%d", address, port), Net: "udp", Handler: mux} + server := resolver.Server{ + Addr: fmt.Sprintf("%s:%d", address, port), + Net: "udp", + Handler: mux, + } err := server.ListenAndServe() if err != nil { - return fmt.Errorf("failed to start DNS server: %w", err) + return fmt.Errorf("DNS: failed to start server: %w", err) } return nil diff --git a/packages/api/internal/orchestrator/cache.go b/packages/api/internal/orchestrator/cache.go index 8c1531f26..db0907813 100644 --- a/packages/api/internal/orchestrator/cache.go +++ b/packages/api/internal/orchestrator/cache.go @@ -147,7 +147,9 @@ func (o *Orchestrator) getDeleteInstanceFunction(ctx context.Context, posthogCli node.CPUUsage.Add(-info.VCpu) node.RamUsage.Add(-info.RamMB) - o.dns.Remove(info.Instance.SandboxID, node.Info.IPAddress) + if err := o.dns.Remove(info.Instance.SandboxID); err != nil { + return err + } } req := &orchestrator.SandboxDeleteRequest{SandboxId: info.Instance.SandboxID} @@ -179,7 +181,9 @@ func (o *Orchestrator) getInsertInstanceFunction(ctx context.Context, logger *za node.CPUUsage.Add(info.VCpu) node.RamUsage.Add(info.RamMB) - o.dns.Add(info.Instance.SandboxID, node.Info.IPAddress) + if err := o.dns.Add(info.Instance.SandboxID, node.Info.IPAddress); err != nil { + return err + } } _, err := o.analytics.Client.InstanceStarted(ctx, &analyticscollector.InstanceStartedEvent{ diff --git a/packages/api/internal/orchestrator/orchestrator.go b/packages/api/internal/orchestrator/orchestrator.go index 5935a987e..faf4f9f30 100644 --- a/packages/api/internal/orchestrator/orchestrator.go +++ b/packages/api/internal/orchestrator/orchestrator.go @@ -3,9 +3,9 @@ package orchestrator import ( "context" "errors" - "fmt" "log" + redis "github.com/go-redis/redis/v8" nomadapi "github.com/hashicorp/nomad/api" "go.opentelemetry.io/otel/trace" "go.uber.org/zap" @@ -39,28 +39,12 @@ func New( logger.Errorf("Error initializing Analytics client\n: %v", err) } - dnsServer := dns.New() - - if env.IsLocal() { - fmt.Printf("Running locally, skipping starting DNS server\n") - } else { - go func() { - fmt.Printf("Starting DNS server\n") - - dnsErr := dnsServer.Start("127.0.0.4", 53) - if dnsErr != nil { - log.Fatalf("Failed running DNS server: %v\n", dnsErr) - } - }() - } - o := Orchestrator{ analytics: analyticsInstance, nomadClient: nomadClient, logger: logger, tracer: tracer, nodes: smap.New[*Node](), - dns: dnsServer, } cache := instance.NewCache( @@ -72,9 +56,36 @@ func New( o.instanceCache = cache + rdbOpts := &redis.Options{Addr: "127.0.0.1:6379"} + + fallbackResolverFn := func(sandboxID string) (string, bool) { + for _, apiNode := range o.GetNodes() { + detail := o.GetNodeDetail(apiNode.NodeID) + for _, sb := range detail.Sandboxes { + if sandboxID == sb.SandboxID { + if node := o.GetNode(apiNode.NodeID); node != nil { + return node.Info.IPAddress, true + } + } + } + } + return "", false + } + + o.dns = dns.New(ctx, rdbOpts, fallbackResolverFn, logger) + if env.IsLocal() { - logger.Info("Skipping syncing sandboxes, running locally") + logger.Info("Running locally, skipping starting DNS server") + logger.Info("Running locally, skipping syncing sandboxes") } else { + go func() { + logger.Info("Starting DNS server") + + if err := o.dns.Start("127.0.0.4", 53); err != nil { + log.Fatalf("Failed running DNS server: %v\n", err) + } + }() + go o.keepInSync(cache) } diff --git a/packages/nomad/main.tf b/packages/nomad/main.tf index 29914e5ca..7340b9f58 100644 --- a/packages/nomad/main.tf +++ b/packages/nomad/main.tf @@ -80,6 +80,16 @@ resource "nomad_job" "api" { } } +resource "nomad_job" "redis" { + jobspec = file("${path.module}/redis.hcl") + + hcl2 { + vars = { + gcp_zone = var.gcp_zone + } + } +} + resource "nomad_job" "docker_reverse_proxy" { jobspec = file("${path.module}/docker-reverse-proxy.hcl") diff --git a/packages/nomad/redis.hcl b/packages/nomad/redis.hcl new file mode 100644 index 000000000..7f1670ac5 --- /dev/null +++ b/packages/nomad/redis.hcl @@ -0,0 +1,64 @@ +variable "gcp_zone" { + type = string + default = "us-central1-a" +} + +variable "image_name" { + type = string + default = "redis:7.4.2-alpine" +} + +variable "redis_port_number" { + type = number + default = 6379 +} + +variable "redis_port_name" { + type = string + default = "redis" +} + +job "redis" { + datacenters = [var.gcp_zone] + node_pool = "api" + priority = 95 + + group "api-service" { + network { + port "redis" { + static = var.redis_port_number + } + } + + service { + name = "redis" + port = var.redis_port_name + + check { + type = "tcp" + name = "health" + interval = "10s" + timeout = "2s" + port = var.redis_port_number + } + } + + task "start" { + driver = "docker" + + resources { + memory_max = 4096 + memory = 2048 + cpu = 1024 + } + + config { + network_mode = "host" + image = var.image_name + ports = [var.redis_port_name] + args = [ + ] + } + } + } +}