diff --git a/packages/nomad/loki.hcl b/packages/nomad/loki.hcl index 9970eabe3..5eeab44c4 100644 --- a/packages/nomad/loki.hcl +++ b/packages/nomad/loki.hcl @@ -57,8 +57,8 @@ job "loki" { resources { memory_max = 2048 - memory = 1024 - cpu = 512 + memory = 256 + cpu = 256 } template { diff --git a/packages/nomad/otel-collector.hcl b/packages/nomad/otel-collector.hcl index 7c70e82fb..40a460c02 100644 --- a/packages/nomad/otel-collector.hcl +++ b/packages/nomad/otel-collector.hcl @@ -194,6 +194,15 @@ processors: - "nomad_client_unallocated_memory" - "orchestrator.*" - "api.*" + metricstransform: + transforms: + - include: "nomad_client_host_cpu_idle" + match_type: strict + action: update + operations: + - action: aggregate_labels + aggregation_type: sum + label_set: [instance, node_id, node_status, node_pool] attributes/session-proxy: actions: - key: service.name @@ -249,7 +258,7 @@ service: receivers: - prometheus - otlp - processors: [filter, batch] + processors: [filter, batch, metricstransform] exporters: - prometheusremotewrite/grafana_cloud_metrics # metrics/session-proxy: diff --git a/packages/nomad/proxies/client.conf b/packages/nomad/proxies/client.conf index 048ef39f4..3f03ca742 100644 --- a/packages/nomad/proxies/client.conf +++ b/packages/nomad/proxies/client.conf @@ -73,7 +73,7 @@ server { location / { if ($node_ip = "") { # If you set any text, the header will be set to `application/octet-stream` and then browser won't be able to render the content - return 404; + return 404; # Invalid sandbox url } @@ -85,6 +85,14 @@ server { } } +# Mock for sandbox server when the sandbox is not running, 127.0.0.1 is returned by the DNS resolver +server { + listen 3003; + + default_type text/plain; + return 404 'Sandbox is not running or not found.'; +} + server { listen 3001; location /health { diff --git a/packages/orchestrator/internal/sandbox/build/build.go b/packages/orchestrator/internal/sandbox/build/build.go index bb46cff2c..fe3a5c415 100644 --- a/packages/orchestrator/internal/sandbox/build/build.go +++ b/packages/orchestrator/internal/sandbox/build/build.go @@ -102,6 +102,7 @@ func (b *File) Slice(off, length int64) ([]byte, error) { return nil, fmt.Errorf("failed to get mapping: %w", err) } + // Pass empty huge page when the build id is nil. if *buildID == uuid.Nil { return header.EmptyHugePage, nil } diff --git a/packages/orchestrator/internal/sandbox/build/cache.go b/packages/orchestrator/internal/sandbox/build/cache.go index 999aa0969..e5115b396 100644 --- a/packages/orchestrator/internal/sandbox/build/cache.go +++ b/packages/orchestrator/internal/sandbox/build/cache.go @@ -11,7 +11,7 @@ import ( "github.com/e2b-dev/infra/packages/shared/pkg/storage/gcs" ) -const buildExpiration = time.Hour * 25 +const buildExpiration = time.Hour * 48 const cachePath = "/orchestrator/build" diff --git a/packages/orchestrator/internal/sandbox/sandbox.go b/packages/orchestrator/internal/sandbox/sandbox.go index 29ca78edb..17126de41 100644 --- a/packages/orchestrator/internal/sandbox/sandbox.go +++ b/packages/orchestrator/internal/sandbox/sandbox.go @@ -84,7 +84,6 @@ func NewSandbox( config.KernelVersion, config.FirecrackerVersion, config.HugePages, - isSnapshot, ) if err != nil { return nil, cleanup, fmt.Errorf("failed to get template snapshot data: %w", err) @@ -393,7 +392,13 @@ func (s *Sandbox) Snapshot( return nil, fmt.Errorf("failed to create memfile diff file: %w", err) } - err = header.CreateDiff(sourceFile, s.files.MemfilePageSize(), memfileDirtyPages, memfileDiffFile) + memfileDirtyPages, emptyDirtyPages, err := header.CreateDiff( + sourceFile, + s.files.MemfilePageSize(), + memfileDirtyPages, + originalMemfile, + memfileDiffFile, + ) if err != nil { return nil, fmt.Errorf("failed to create memfile diff: %w", err) } @@ -402,15 +407,32 @@ func (s *Sandbox) Snapshot( releaseLock() - memfileMapping := header.CreateMapping( - memfileMetadata, + var memfileMappings []*header.BuildMap + + memfileEmptyMapping := header.CreateMapping( + &uuid.Nil, + emptyDirtyPages, + memfileMetadata.BlockSize, + ) + + if memfileEmptyMapping != nil { + memfileMappings = header.MergeMappings( + originalMemfile.Header().Mapping, + memfileEmptyMapping, + ) + + memfileMappings = header.NormalizeMappings(memfileMappings) + } + + memfileDirtyMappings := header.CreateMapping( &buildId, memfileDirtyPages, + memfileMetadata.BlockSize, ) - memfileMappings := header.MergeMappings( + memfileMappings = header.MergeMappings( originalMemfile.Header().Mapping, - memfileMapping, + memfileDirtyMappings, ) snapfile, err := template.NewLocalFile(snapshotTemplateFiles.CacheSnapfilePath()) @@ -469,9 +491,9 @@ func (s *Sandbox) Snapshot( } rootfsMapping := header.CreateMapping( - rootfsMetadata, &buildId, rootfsDirtyBlocks, + rootfsMetadata.BlockSize, ) rootfsMappings := header.MergeMappings( diff --git a/packages/orchestrator/internal/sandbox/template/cache.go b/packages/orchestrator/internal/sandbox/template/cache.go index 45383138a..991913757 100644 --- a/packages/orchestrator/internal/sandbox/template/cache.go +++ b/packages/orchestrator/internal/sandbox/template/cache.go @@ -14,7 +14,7 @@ import ( // How long to keep the template in the cache since the last access. // Should be longer than the maximum possible sandbox lifetime. -const templateExpiration = time.Hour * 25 +const templateExpiration = time.Hour * 72 type Cache struct { cache *ttlcache.Cache[string, Template] @@ -62,7 +62,6 @@ func (c *Cache) GetTemplate( kernelVersion, firecrackerVersion string, hugePages bool, - isSnapshot bool, ) (Template, error) { storageTemplate, err := newTemplateFromStorage( templateId, @@ -70,7 +69,6 @@ func (c *Cache) GetTemplate( kernelVersion, firecrackerVersion, hugePages, - isSnapshot, nil, nil, c.bucket, @@ -125,7 +123,6 @@ func (c *Cache) AddSnapshot( kernelVersion, firecrackerVersion, hugePages, - true, memfileHeader, rootfsHeader, c.bucket, diff --git a/packages/orchestrator/internal/sandbox/template/storage.go b/packages/orchestrator/internal/sandbox/template/storage.go index 0057c5f96..fc594dae2 100644 --- a/packages/orchestrator/internal/sandbox/template/storage.go +++ b/packages/orchestrator/internal/sandbox/template/storage.go @@ -2,6 +2,7 @@ package template import ( "context" + "errors" "fmt" "github.com/google/uuid" @@ -23,20 +24,23 @@ func NewStorage( buildId string, fileType build.DiffType, blockSize int64, - isSnapshot bool, h *header.Header, bucket *gcs.BucketHandle, ) (*Storage, error) { - if isSnapshot && h == nil { + if h == nil { headerObject := gcs.NewObject(ctx, bucket, buildId+"/"+string(fileType)+storage.HeaderSuffix) diffHeader, err := header.Deserialize(headerObject) - if err != nil { + if err != nil && !errors.As(gcs.ErrObjectNotExist, err) { return nil, fmt.Errorf("failed to deserialize header: %w", err) } - h = diffHeader - } else if h == nil { + if err == nil { + h = diffHeader + } + } + + if h == nil { object := gcs.NewObject(ctx, bucket, buildId+"/"+string(fileType)) size, err := object.Size() diff --git a/packages/orchestrator/internal/sandbox/template/storage_template.go b/packages/orchestrator/internal/sandbox/template/storage_template.go index 62b817151..6dc92b5b3 100644 --- a/packages/orchestrator/internal/sandbox/template/storage_template.go +++ b/packages/orchestrator/internal/sandbox/template/storage_template.go @@ -20,8 +20,6 @@ type storageTemplate struct { rootfs *utils.SetOnce[*Storage] snapfile *utils.SetOnce[File] - isSnapshot bool - memfileHeader *header.Header rootfsHeader *header.Header localSnapfile *LocalFile @@ -35,7 +33,6 @@ func newTemplateFromStorage( kernelVersion, firecrackerVersion string, hugePages bool, - isSnapshot bool, memfileHeader *header.Header, rootfsHeader *header.Header, bucket *gcs.BucketHandle, @@ -55,7 +52,6 @@ func newTemplateFromStorage( return &storageTemplate{ files: files, localSnapfile: localSnapfile, - isSnapshot: isSnapshot, memfileHeader: memfileHeader, rootfsHeader: rootfsHeader, bucket: bucket, @@ -111,7 +107,6 @@ func (t *storageTemplate) Fetch(ctx context.Context, buildStore *build.DiffStore t.files.BuildId, build.Memfile, t.files.MemfilePageSize(), - t.isSnapshot, t.memfileHeader, t.bucket, ) @@ -134,7 +129,6 @@ func (t *storageTemplate) Fetch(ctx context.Context, buildStore *build.DiffStore t.files.BuildId, build.Rootfs, t.files.RootfsBlockSize(), - t.isSnapshot, t.rootfsHeader, t.bucket, ) diff --git a/packages/orchestrator/internal/server/sandboxes.go b/packages/orchestrator/internal/server/sandboxes.go index f61d3029a..df1160e59 100644 --- a/packages/orchestrator/internal/server/sandboxes.go +++ b/packages/orchestrator/internal/server/sandboxes.go @@ -163,20 +163,22 @@ func (s *server) Delete(ctx context.Context, in *orchestrator.SandboxDeleteReque return nil, status.New(codes.NotFound, errMsg.Error()).Err() } - sbx.Healthcheck(ctx, true) - // Don't allow connecting to the sandbox anymore. s.dns.Remove(in.SandboxId, sbx.Slot.HostIP()) + // Remove the sandbox from the cache to prevent loading it again in API during the time the instance is stopping. + // Old comment: + // Ensure the sandbox is removed from cache. + // Ideally we would rely only on the goroutine defer. + s.sandboxes.Remove(in.SandboxId) + + sbx.Healthcheck(ctx, true) + err := sbx.Stop() if err != nil { fmt.Fprintf(os.Stderr, "error stopping sandbox '%s': %v\n", in.SandboxId, err) } - // Ensure the sandbox is removed from cache. - // Ideally we would rely only on the goroutine defer. - s.sandboxes.Remove(in.SandboxId) - return &emptypb.Empty{}, nil } diff --git a/packages/shared/pkg/dns/server.go b/packages/shared/pkg/dns/server.go index 8f065d291..98f2b6306 100644 --- a/packages/shared/pkg/dns/server.go +++ b/packages/shared/pkg/dns/server.go @@ -14,6 +14,8 @@ import ( const ttl = 0 +const defaultRoutingIP = "127.0.0.1" + type DNS struct { mu sync.Mutex records *smap.Map[string] @@ -51,21 +53,24 @@ func (d *DNS) handleDNSRequest(w resolver.ResponseWriter, r *resolver.Msg) { for _, q := range m.Question { if q.Qtype == resolver.TypeA { + a := &resolver.A{ + Hdr: resolver.RR_Header{ + Name: q.Name, + Rrtype: resolver.TypeA, + Class: resolver.ClassINET, + Ttl: ttl, + }, + } + sandboxID := strings.Split(q.Name, "-")[0] ip, found := d.get(sandboxID) if found { - a := &resolver.A{ - Hdr: resolver.RR_Header{ - Name: q.Name, - Rrtype: resolver.TypeA, - Class: resolver.ClassINET, - Ttl: ttl, - }, - A: net.ParseIP(ip).To4(), - } - - m.Answer = append(m.Answer, a) + a.A = net.ParseIP(ip).To4() + } else { + a.A = net.ParseIP(defaultRoutingIP).To4() } + + m.Answer = append(m.Answer, a) } } diff --git a/packages/shared/pkg/storage/gcs/object.go b/packages/shared/pkg/storage/gcs/object.go index ad9062918..3578d8de7 100644 --- a/packages/shared/pkg/storage/gcs/object.go +++ b/packages/shared/pkg/storage/gcs/object.go @@ -27,6 +27,8 @@ type Object struct { ctx context.Context } +var ErrObjectNotExist = storage.ErrObjectNotExist + func NewObject(ctx context.Context, bucket *storage.BucketHandle, objectPath string) *Object { obj := bucket.Object(objectPath).Retryer( storage.WithMaxAttempts(maxAttempts), @@ -50,7 +52,7 @@ func (o *Object) WriteTo(dst io.Writer) (int64, error) { reader, err := o.object.NewReader(ctx) if err != nil { - return 0, fmt.Errorf("failed to create GCS reader: %w", err) + return 0, err } defer reader.Close() diff --git a/packages/shared/pkg/storage/header/diff.go b/packages/shared/pkg/storage/header/diff.go index 38d61fb95..272b7565a 100644 --- a/packages/shared/pkg/storage/header/diff.go +++ b/packages/shared/pkg/storage/header/diff.go @@ -1,6 +1,7 @@ package header import ( + "bytes" "fmt" "io" @@ -18,20 +19,61 @@ var ( EmptyBlock = make([]byte, RootfsBlockSize) ) -func CreateDiff(source io.ReaderAt, blockSize int64, dirty *bitset.BitSet, diff io.Writer) error { +type Slicer interface { + Slice(off, length int64) ([]byte, error) +} + +func CreateDiff( + source io.ReaderAt, + blockSize int64, + dirty *bitset.BitSet, + base Slicer, + diff io.Writer, +) (*bitset.BitSet, *bitset.BitSet, error) { b := make([]byte, blockSize) + emptyBuf := EmptyBlock + if blockSize == HugepageSize { + emptyBuf = EmptyHugePage + } + + empty := bitset.New(0) + for i, e := dirty.NextSet(0); e; i, e = dirty.NextSet(i + 1) { _, err := source.ReadAt(b, int64(i)*blockSize) if err != nil { - return fmt.Errorf("error reading from source: %w", err) + return nil, nil, fmt.Errorf("error reading from source: %w", err) + } + + if base != nil { + // At this moment the template should be cached locally, because it was used when starting or during running the sandbox—that's why it is dirty. + cacheBlock, err := base.Slice(int64(i)*blockSize, blockSize) + if err != nil { + return nil, nil, fmt.Errorf("error reading from cache: %w", err) + } + + // If the block is the same as in the base it is not dirty. + if bytes.Equal(b, cacheBlock) { + dirty.Clear(uint(i)) + + continue + } + } + + // If the block is empty, we don't need to write it to the diff. + // Because we checked it does not equal to the base, so we keep it separately. + if bytes.Equal(b, emptyBuf) { + dirty.Clear(uint(i)) + empty.Set(uint(i)) + + continue } _, err = diff.Write(b) if err != nil { - return fmt.Errorf("error writing to diff: %w", err) + return nil, nil, fmt.Errorf("error writing to diff: %w", err) } } - return nil + return dirty, empty, nil } diff --git a/packages/shared/pkg/storage/header/mapping.go b/packages/shared/pkg/storage/header/mapping.go index 81f78ccc1..d645c9398 100644 --- a/packages/shared/pkg/storage/header/mapping.go +++ b/packages/shared/pkg/storage/header/mapping.go @@ -21,9 +21,9 @@ type BuildMap struct { } func CreateMapping( - metadata *Metadata, buildId *uuid.UUID, dirty *bitset.BitSet, + blockSize uint64, ) []*BuildMap { var mappings []*BuildMap @@ -40,9 +40,9 @@ func CreateMapping( if blockLength > 0 { m := &BuildMap{ - Offset: uint64(int64(startBlock) * int64(metadata.BlockSize)), + Offset: uint64(int64(startBlock) * int64(blockSize)), BuildId: *buildId, - Length: uint64(blockLength) * uint64(metadata.BlockSize), + Length: uint64(blockLength) * uint64(blockSize), BuildStorageOffset: buildStorageOffset, } @@ -57,9 +57,9 @@ func CreateMapping( if blockLength > 0 { mappings = append(mappings, &BuildMap{ - Offset: uint64(startBlock) * metadata.BlockSize, + Offset: uint64(startBlock) * blockSize, BuildId: *buildId, - Length: uint64(blockLength) * uint64(metadata.BlockSize), + Length: uint64(blockLength) * blockSize, BuildStorageOffset: buildStorageOffset, }) } @@ -236,6 +236,18 @@ func MergeMappings( return mappings } +// Join adjanced mappings that have the same buildId. +func NormalizeMappings(mappings []*BuildMap) []*BuildMap { + for i := 0; i < len(mappings); i++ { + if i+1 < len(mappings) && mappings[i].BuildId == mappings[i+1].BuildId { + mappings[i].Length += mappings[i+1].Length + mappings = append(mappings[:i+1], mappings[i+2:]...) + } + } + + return mappings +} + // Format returns a string representation of the mapping as: // // startBlock-endBlock [offset, offset+length) := [buildStorageOffset, buildStorageOffset+length) ⊂ buildId, length in bytes diff --git a/packages/shared/pkg/storage/template.go b/packages/shared/pkg/storage/template.go index 64afadab0..19a36f6ae 100644 --- a/packages/shared/pkg/storage/template.go +++ b/packages/shared/pkg/storage/template.go @@ -123,6 +123,14 @@ func (t *TemplateFiles) BuildRootfsPath() string { return filepath.Join(t.BuildDir(), RootfsName) } +func (t *TemplateFiles) BuildMemfileDiffPath() string { + return filepath.Join(t.BuildDir(), fmt.Sprintf("%s.diff", MemfileName)) +} + +func (t *TemplateFiles) BuildRootfsDiffPath() string { + return filepath.Join(t.BuildDir(), fmt.Sprintf("%s.diff", RootfsName)) +} + func (t *TemplateFiles) BuildSnapfilePath() string { return filepath.Join(t.BuildDir(), SnapfileName) } diff --git a/packages/template-manager/internal/server/create_template.go b/packages/template-manager/internal/server/create_template.go index 4092d6a37..36a9ba88b 100644 --- a/packages/template-manager/internal/server/create_template.go +++ b/packages/template-manager/internal/server/create_template.go @@ -3,16 +3,20 @@ package server import ( "context" "fmt" + "os" "os/exec" "strconv" "strings" "time" + "github.com/bits-and-blooms/bitset" + "github.com/google/uuid" "go.opentelemetry.io/otel/attribute" "google.golang.org/grpc/metadata" template_manager "github.com/e2b-dev/infra/packages/shared/pkg/grpc/template-manager" "github.com/e2b-dev/infra/packages/shared/pkg/storage" + "github.com/e2b-dev/infra/packages/shared/pkg/storage/header" "github.com/e2b-dev/infra/packages/shared/pkg/telemetry" "github.com/e2b-dev/infra/packages/template-manager/internal/build" "github.com/e2b-dev/infra/packages/template-manager/internal/build/writer" @@ -59,7 +63,7 @@ func (s *serverStore) TemplateCreate(templateRequest *template_manager.TemplateC var err error - // Remove local template files if build fails + // Remove local template files after build ends. defer func() { removeCtx, cancel := context.WithTimeout(context.Background(), cleanupTimeout) defer cancel() @@ -92,14 +96,139 @@ func (s *serverStore) TemplateCreate(templateRequest *template_manager.TemplateC } }() + buildID, err := uuid.Parse(config.BuildID) + if err != nil { + return fmt.Errorf("error parsing build id: %w", err) + } + + // MEMFILE memfilePath := template.BuildMemfilePath() + memfileDiffPath := template.BuildMemfileDiffPath() + + memfileSource, err := os.Open(memfilePath) + if err != nil { + return fmt.Errorf("error opening memfile source: %w", err) + } + + memfileInfo, err := memfileSource.Stat() + if err != nil { + return fmt.Errorf("error getting memfile size: %w", err) + } + + memfileDiffFile, err := os.Create(memfileDiffPath) + if err != nil { + return fmt.Errorf("error creating memfile diff file: %w", err) + } + + memfileDirtyPages := bitset.New(0) + memfileDirtyPages.FlipRange(0, uint(header.TotalBlocks(memfileInfo.Size(), template.MemfilePageSize()))) + + memfileDirtyPages, emptyDirtyPages, err := header.CreateDiff( + memfileSource, + template.MemfilePageSize(), + memfileDirtyPages, + nil, + memfileDiffFile, + ) + + memfileDirtyMappings := header.CreateMapping( + &buildID, + memfileDirtyPages, + uint64(template.MemfilePageSize()), + ) + + memfileEmptyMappings := header.CreateMapping( + &uuid.Nil, + emptyDirtyPages, + uint64(template.MemfilePageSize()), + ) + + memfileMappings := header.MergeMappings(memfileDirtyMappings, memfileEmptyMappings) + + memfileMetadata := &header.Metadata{ + Version: 1, + Generation: 0, + BlockSize: uint64(template.MemfilePageSize()), + Size: uint64(memfileInfo.Size()), + BuildId: buildID, + BaseBuildId: buildID, + } + + memfileHeader := header.NewHeader( + memfileMetadata, + memfileMappings, + ) + + // ROOTFS rootfsPath := template.BuildRootfsPath() + rootfsDiffPath := template.BuildRootfsDiffPath() + + rootfsSource, err := os.Open(rootfsPath) + if err != nil { + return fmt.Errorf("error opening rootfs source: %w", err) + } + + rootfsInfo, err := rootfsSource.Stat() + if err != nil { + return fmt.Errorf("error getting rootfs size: %w", err) + } + + rootfsDiffFile, err := os.Create(rootfsDiffPath) + if err != nil { + return fmt.Errorf("error creating rootfs diff file: %w", err) + } + + rootfsDirtyBlocks := bitset.New(0) + rootfsDirtyBlocks.FlipRange(0, uint(header.TotalBlocks(rootfsInfo.Size(), template.RootfsBlockSize()))) + + rootfsDirtyBlocks, emptyDirtyBlocks, err := header.CreateDiff( + rootfsSource, + template.RootfsBlockSize(), + rootfsDirtyBlocks, + nil, + rootfsDiffFile, + ) + + rootfsDirtyMappings := header.CreateMapping( + &buildID, + rootfsDirtyBlocks, + uint64(template.RootfsBlockSize()), + ) + + rootfsEmptyMappings := header.CreateMapping( + &uuid.Nil, + emptyDirtyBlocks, + uint64(template.RootfsBlockSize()), + ) + + rootfsMappings := header.MergeMappings(rootfsDirtyMappings, rootfsEmptyMappings) + + rootfsMetadata := &header.Metadata{ + Version: 1, + Generation: 0, + BlockSize: uint64(template.RootfsBlockSize()), + Size: uint64(rootfsInfo.Size()), + BuildId: buildID, + BaseBuildId: buildID, + } + + rootfsHeader := header.NewHeader( + rootfsMetadata, + rootfsMappings, + ) + + // UPLOAD + b := storage.NewTemplateBuild( + memfileHeader, + rootfsHeader, + template.TemplateFiles, + ) - upload := buildStorage.Upload( + upload := b.Upload( childCtx, template.BuildSnapfilePath(), - &memfilePath, - &rootfsPath, + &memfileDiffPath, + &rootfsDiffPath, ) cmd := exec.Command(storage.HostEnvdPath, "-version")