Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize snapshots storage #234

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions packages/nomad/loki.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ job "loki" {

resources {
memory_max = 2048
memory = 1024
cpu = 512
memory = 256
cpu = 256
}

template {
Expand Down
11 changes: 10 additions & 1 deletion packages/nomad/otel-collector.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,15 @@ processors:
- "nomad_client_unallocated_memory"
- "orchestrator.*"
- "api.*"
metricstransform:
transforms:
- include: "nomad_client_host_cpu_idle"
match_type: strict
action: update
operations:
- action: aggregate_labels
aggregation_type: sum
label_set: [instance, node_id, node_status, node_pool]
attributes/session-proxy:
actions:
- key: service.name
Expand Down Expand Up @@ -249,7 +258,7 @@ service:
receivers:
- prometheus
- otlp
processors: [filter, batch]
processors: [filter, batch, metricstransform]
exporters:
- prometheusremotewrite/grafana_cloud_metrics
# metrics/session-proxy:
Expand Down
10 changes: 9 additions & 1 deletion packages/nomad/proxies/client.conf
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ server {
location / {
if ($node_ip = "") {
# If you set any response text here, the Content-Type header will be `application/octet-stream` and the browser won't be able to render the content
return 404;
return 404; # Invalid sandbox url
}


Expand All @@ -85,6 +85,14 @@ server {
}
}

# Mock sandbox server used when the sandbox is not running; in that case the DNS resolver returns 127.0.0.1.
server {
# Port of the mock server.
listen 3003;

# Send the response body as plain text.
default_type text/plain;
# Server-level return: answer with 404 and a short explanatory body.
return 404 'Sandbox is not running or not found.';
}

server {
listen 3001;
location /health {
Expand Down
1 change: 1 addition & 0 deletions packages/orchestrator/internal/sandbox/build/build.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ func (b *File) Slice(off, length int64) ([]byte, error) {
return nil, fmt.Errorf("failed to get mapping: %w", err)
}

// Pass empty huge page when the build id is nil.
if *buildID == uuid.Nil {
return header.EmptyHugePage, nil
}
Expand Down
2 changes: 1 addition & 1 deletion packages/orchestrator/internal/sandbox/build/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
"github.com/e2b-dev/infra/packages/shared/pkg/storage/gcs"
)

const buildExpiration = time.Hour * 25
const buildExpiration = time.Hour * 48

const cachePath = "/orchestrator/build"

Expand Down
36 changes: 29 additions & 7 deletions packages/orchestrator/internal/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ func NewSandbox(
config.KernelVersion,
config.FirecrackerVersion,
config.HugePages,
isSnapshot,
)
if err != nil {
return nil, cleanup, fmt.Errorf("failed to get template snapshot data: %w", err)
Expand Down Expand Up @@ -393,7 +392,13 @@ func (s *Sandbox) Snapshot(
return nil, fmt.Errorf("failed to create memfile diff file: %w", err)
}

err = header.CreateDiff(sourceFile, s.files.MemfilePageSize(), memfileDirtyPages, memfileDiffFile)
memfileDirtyPages, emptyDirtyPages, err := header.CreateDiff(
sourceFile,
s.files.MemfilePageSize(),
memfileDirtyPages,
originalMemfile,
memfileDiffFile,
)
if err != nil {
return nil, fmt.Errorf("failed to create memfile diff: %w", err)
}
Expand All @@ -402,15 +407,32 @@ func (s *Sandbox) Snapshot(

releaseLock()

memfileMapping := header.CreateMapping(
memfileMetadata,
var memfileMappings []*header.BuildMap

memfileEmptyMapping := header.CreateMapping(
&uuid.Nil,
emptyDirtyPages,
memfileMetadata.BlockSize,
)

if memfileEmptyMapping != nil {
memfileMappings = header.MergeMappings(
originalMemfile.Header().Mapping,
memfileEmptyMapping,
)

memfileMappings = header.NormalizeMappings(memfileMappings)
}

memfileDirtyMappings := header.CreateMapping(
&buildId,
memfileDirtyPages,
memfileMetadata.BlockSize,
)

memfileMappings := header.MergeMappings(
memfileMappings = header.MergeMappings(
originalMemfile.Header().Mapping,
memfileMapping,
memfileDirtyMappings,
)

snapfile, err := template.NewLocalFile(snapshotTemplateFiles.CacheSnapfilePath())
Expand Down Expand Up @@ -469,9 +491,9 @@ func (s *Sandbox) Snapshot(
}

rootfsMapping := header.CreateMapping(
rootfsMetadata,
&buildId,
rootfsDirtyBlocks,
rootfsMetadata.BlockSize,
)

rootfsMappings := header.MergeMappings(
Expand Down
5 changes: 1 addition & 4 deletions packages/orchestrator/internal/sandbox/template/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (

// How long to keep the template in the cache since the last access.
// Should be longer than the maximum possible sandbox lifetime.
const templateExpiration = time.Hour * 25
const templateExpiration = time.Hour * 72

type Cache struct {
cache *ttlcache.Cache[string, Template]
Expand Down Expand Up @@ -62,15 +62,13 @@ func (c *Cache) GetTemplate(
kernelVersion,
firecrackerVersion string,
hugePages bool,
isSnapshot bool,
) (Template, error) {
storageTemplate, err := newTemplateFromStorage(
templateId,
buildId,
kernelVersion,
firecrackerVersion,
hugePages,
isSnapshot,
nil,
nil,
c.bucket,
Expand Down Expand Up @@ -125,7 +123,6 @@ func (c *Cache) AddSnapshot(
kernelVersion,
firecrackerVersion,
hugePages,
true,
memfileHeader,
rootfsHeader,
c.bucket,
Expand Down
14 changes: 9 additions & 5 deletions packages/orchestrator/internal/sandbox/template/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package template

import (
"context"
"errors"
"fmt"

"github.com/google/uuid"
Expand All @@ -23,20 +24,23 @@ func NewStorage(
buildId string,
fileType build.DiffType,
blockSize int64,
isSnapshot bool,
h *header.Header,
bucket *gcs.BucketHandle,
) (*Storage, error) {
if isSnapshot && h == nil {
if h == nil {
headerObject := gcs.NewObject(ctx, bucket, buildId+"/"+string(fileType)+storage.HeaderSuffix)

diffHeader, err := header.Deserialize(headerObject)
if err != nil {
if err != nil && !errors.As(gcs.ErrObjectNotExist, err) {
return nil, fmt.Errorf("failed to deserialize header: %w", err)
}

h = diffHeader
} else if h == nil {
if err == nil {
h = diffHeader
}
}

if h == nil {
object := gcs.NewObject(ctx, bucket, buildId+"/"+string(fileType))

size, err := object.Size()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ type storageTemplate struct {
rootfs *utils.SetOnce[*Storage]
snapfile *utils.SetOnce[File]

isSnapshot bool

memfileHeader *header.Header
rootfsHeader *header.Header
localSnapfile *LocalFile
Expand All @@ -35,7 +33,6 @@ func newTemplateFromStorage(
kernelVersion,
firecrackerVersion string,
hugePages bool,
isSnapshot bool,
memfileHeader *header.Header,
rootfsHeader *header.Header,
bucket *gcs.BucketHandle,
Expand All @@ -55,7 +52,6 @@ func newTemplateFromStorage(
return &storageTemplate{
files: files,
localSnapfile: localSnapfile,
isSnapshot: isSnapshot,
memfileHeader: memfileHeader,
rootfsHeader: rootfsHeader,
bucket: bucket,
Expand Down Expand Up @@ -111,7 +107,6 @@ func (t *storageTemplate) Fetch(ctx context.Context, buildStore *build.DiffStore
t.files.BuildId,
build.Memfile,
t.files.MemfilePageSize(),
t.isSnapshot,
t.memfileHeader,
t.bucket,
)
Expand All @@ -134,7 +129,6 @@ func (t *storageTemplate) Fetch(ctx context.Context, buildStore *build.DiffStore
t.files.BuildId,
build.Rootfs,
t.files.RootfsBlockSize(),
t.isSnapshot,
t.rootfsHeader,
t.bucket,
)
Expand Down
14 changes: 8 additions & 6 deletions packages/orchestrator/internal/server/sandboxes.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,20 +163,22 @@ func (s *server) Delete(ctx context.Context, in *orchestrator.SandboxDeleteReque
return nil, status.New(codes.NotFound, errMsg.Error()).Err()
}

sbx.Healthcheck(ctx, true)

// Don't allow connecting to the sandbox anymore.
s.dns.Remove(in.SandboxId, sbx.Slot.HostIP())

// Remove the sandbox from the cache so the API cannot load it again while the instance is stopping.
// Old comment:
// Ensure the sandbox is removed from cache.
// Ideally we would rely only on the goroutine defer.
s.sandboxes.Remove(in.SandboxId)

sbx.Healthcheck(ctx, true)

err := sbx.Stop()
if err != nil {
fmt.Fprintf(os.Stderr, "error stopping sandbox '%s': %v\n", in.SandboxId, err)
}

// Ensure the sandbox is removed from cache.
// Ideally we would rely only on the goroutine defer.
s.sandboxes.Remove(in.SandboxId)

return &emptypb.Empty{}, nil
}

Expand Down
27 changes: 16 additions & 11 deletions packages/shared/pkg/dns/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (

const ttl = 0

const defaultRoutingIP = "127.0.0.1"

type DNS struct {
mu sync.Mutex
records *smap.Map[string]
Expand Down Expand Up @@ -51,21 +53,24 @@ func (d *DNS) handleDNSRequest(w resolver.ResponseWriter, r *resolver.Msg) {

for _, q := range m.Question {
if q.Qtype == resolver.TypeA {
a := &resolver.A{
Hdr: resolver.RR_Header{
Name: q.Name,
Rrtype: resolver.TypeA,
Class: resolver.ClassINET,
Ttl: ttl,
},
}

sandboxID := strings.Split(q.Name, "-")[0]
ip, found := d.get(sandboxID)
if found {
a := &resolver.A{
Hdr: resolver.RR_Header{
Name: q.Name,
Rrtype: resolver.TypeA,
Class: resolver.ClassINET,
Ttl: ttl,
},
A: net.ParseIP(ip).To4(),
}

m.Answer = append(m.Answer, a)
a.A = net.ParseIP(ip).To4()
} else {
a.A = net.ParseIP(defaultRoutingIP).To4()
}

m.Answer = append(m.Answer, a)
}
}

Expand Down
4 changes: 3 additions & 1 deletion packages/shared/pkg/storage/gcs/object.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ type Object struct {
ctx context.Context
}

var ErrObjectNotExist = storage.ErrObjectNotExist

func NewObject(ctx context.Context, bucket *storage.BucketHandle, objectPath string) *Object {
obj := bucket.Object(objectPath).Retryer(
storage.WithMaxAttempts(maxAttempts),
Expand All @@ -50,7 +52,7 @@ func (o *Object) WriteTo(dst io.Writer) (int64, error) {

reader, err := o.object.NewReader(ctx)
if err != nil {
return 0, fmt.Errorf("failed to create GCS reader: %w", err)
return 0, err
}

defer reader.Close()
Expand Down
Loading