Skip to content

Commit

Permalink
debug build: add logic to simulate split-brain
Browse files Browse the repository at this point in the history
* add hk/sigdeb and ais/usr1deb (tags=debug)
* refactor

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Jan 31, 2025
1 parent 45b9169 commit fce0ada
Show file tree
Hide file tree
Showing 16 changed files with 200 additions and 28 deletions.
2 changes: 1 addition & 1 deletion ais/ais_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (

func init() {
xreg.Init()
hk.TestInit()
hk.Init(false)
}

func TestAIS(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion ais/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ func initDaemon(version, buildTime string) cos.Runner {
sys.GoEnvMaxprocs()

daemon.rg = &rungroup{rs: make(map[string]cos.Runner, 6)}
hk.Init()
hk.Init(true /*run*/)
daemon.rg.add(hk.HK)

// K8s
Expand Down
1 change: 1 addition & 0 deletions ais/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ func (p *proxy) pready(smap *smapX, withRR bool /* also check readiness to rebal
func (p *proxy) Run() error {
config := cmn.GCO.Get()
p.htrun.init(config)
p.setusr1()
p.owner.bmd = newBMDOwnerPrx(config)
p.owner.etl = newEtlMDOwnerPrx(config)

Expand Down
1 change: 1 addition & 0 deletions ais/target.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ func (t *target) Run() error {
}
config := cmn.GCO.Get()
t.htrun.init(config)
t.setusr1()

tstats := t.statsT.(*stats.Trunner)

Expand Down
107 changes: 107 additions & 0 deletions ais/usr1deb.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
//go:build debug

// Package ais provides core functionality for the AIStore object storage.
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*/
package ais

import (
"strconv"

"github.com/NVIDIA/aistore/cmn/debug"
"github.com/NVIDIA/aistore/cmn/nlog"
"github.com/NVIDIA/aistore/core/meta"
"github.com/NVIDIA/aistore/hk"
)

func (t *target) setusr1() { hk.SetUSR1(t.usr1) }
func (p *proxy) setusr1() { hk.SetUSR1(p.usr1) }

// When "odd" and "even" nodes cannot coexist together (this code is triggered by
// `SIGUSR1` simultaneously on all nodes in the cluster).
func (h *htrun) usr1() bool /*split done*/ {
smap := h.owner.smap.get()
if smap.CountActivePs() < 2 {
nlog.Errorln("not enough proxies to perform the split:", smap.StringEx())
return false
}
// 1. current primary port
oport, err := strconv.ParseInt(smap.Primary.PubNet.Port, 10, 64)
debug.AssertNoErr(err)

// 2. deterministically select the second primary that satisfies the following condition:
// its port must have a different parity (which is why we cannot use `meta.HrwProxy`)
var (
npsi *meta.Snode
maxH uint64
)
for _, psi := range smap.Pmap {
if flags := psi.Flags.Clear(meta.SnodeIC); flags != 0 {
continue
}
port, err := strconv.ParseInt(psi.PubNet.Port, 10, 64)
debug.AssertNoErr(err)
if port%2 != oport%2 && psi.IDDigest > maxH {
npsi = psi
maxH = psi.IDDigest
}
}
if npsi == nil {
nlog.Errorln(h.String(), "failed to select the second primary candidate that'd have public port with different parity:",
smap.StringEx())
return false
}

// 3. split cluster in two
var (
myport int64
clone = smap.clone()
nodes = []meta.NodeMap{clone.Pmap, clone.Tmap}
)
myport, err = strconv.ParseInt(h.si.PubNet.Port, 10, 64)
debug.AssertNoErr(err)

for _, nmap := range nodes {
for id, si := range nmap {
port, err := strconv.ParseInt(si.PubNet.Port, 10, 64)
debug.AssertNoErr(err)

if myport%2 != port%2 {
delete(nmap, id)
}
}
}

clone.Version += 100 // TODO: randomize version bump to create scenarios
if myport%2 != oport%2 {
debug.Assert(clone.GetNode(npsi.ID()) != nil, clone.StringEx())
clone.Primary = npsi
}
h.owner.smap.put(clone)

return true
}

func (p *proxy) usr1() {
smap := p.owner.smap.get()
opid := smap.Primary.ID()
if !p.htrun.usr1() {
return
}

nsmap := p.owner.smap.get()
nlog.Infoln(p.String(), "split-brain", nsmap.StringEx())

if opid != nsmap.Primary.ID() && p.SID() == nsmap.Primary.ID() {
nlog.Infoln(p.String(), "becoming new primary")
p.becomeNewPrimary("")
}
}

func (t *target) usr1() {
if t.htrun.usr1() {
nsmap := t.owner.smap.get()
nlog.Infoln(t.String(), "split-brain", nsmap.StringEx())
}
}
9 changes: 9 additions & 0 deletions ais/usr1prod.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
//go:build !debug

// Package ais provides core functionality for the AIStore object storage.
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*/
package ais

func (*htrun) setusr1() {}
2 changes: 1 addition & 1 deletion bench/tools/aisloader/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ func Start(version, buildtime string) (err error) {
// init housekeeper and memsys;
// empty config to use memsys constants;
// alternatively: "memsys": { "min_free": "2gb", ... }
hk.Init()
hk.Init(true /*run*/)
go hk.HK.Run()
hk.WaitStarted()

Expand Down
2 changes: 1 addition & 1 deletion core/core_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
)

func init() {
hk.TestInit()
hk.Init(false)
}

func TestCore(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion ext/dsort/dsort_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
)

func init() {
hk.TestInit()
hk.Init(false)
}

func TestDsort(t *testing.T) {
Expand Down
27 changes: 9 additions & 18 deletions hk/housekeeper.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,31 +51,20 @@ type (
}
)

// singleton
var HK *hk

// interface guard
var _ cos.Runner = (*hk)(nil)

func TestInit() {
_init(false)
}

func Init() {
_init(true)
}

func _init(mustRun bool) {
func Init(mustRun bool) {
HK = &hk{
workCh: make(chan op, workChanCap),
sigCh: make(chan os.Signal, 1),
actions: &timedActions{},
}
HK.stopCh.Init()
if mustRun {
HK.running.Store(false)
} else {
HK.running.Store(true) // mustRun == false: tests only
}
HK.running.Store(!mustRun)
heap.Init(HK.actions)
}

Expand Down Expand Up @@ -119,14 +108,16 @@ func (hk *hk) terminate() {

func (*hk) Stop(error) { HK.stopCh.Close() }

func (hk *hk) Run() (err error) {
hk.setSignal() // SIGINT, et al. - see handleSignal() below
func (hk *hk) Run() error {
hk.setSignal() // SIGINT, et al. (see hk.handleSignal)

hk.timer = time.NewTimer(time.Hour)
hk.running.Store(true)
err = hk._run()

err := hk._run()

hk.terminate()
return
return err
}

func (hk *hk) _run() error {
Expand Down
2 changes: 1 addition & 1 deletion hk/housekeeper_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
)

func TestHousekeeper(t *testing.T) {
hk.TestInit()
hk.Init(false)
go hk.HK.Run()
hk.WaitStarted()
RegisterFailHandler(Fail)
Expand Down
61 changes: 61 additions & 0 deletions hk/sigdeb.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
//go:build debug

// Package hk provides mechanism for registering cleanup
// functions which are invoked at specified intervals.
/*
* Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
*/
package hk

import (
"os/signal"
"runtime"
"strings"
"syscall"

"github.com/NVIDIA/aistore/cmn/cos"
"github.com/NVIDIA/aistore/cmn/nlog"
"github.com/NVIDIA/aistore/sys"
)

var cbUSR1 func()

func SetUSR1(cb func()) { cbUSR1 = cb }

func (hk *hk) setSignal() {
signal.Notify(hk.sigCh,
// ignore, log
syscall.SIGHUP, // kill -SIGHUP
// terminate
syscall.SIGINT, // kill -SIGINT (Ctrl-C)
syscall.SIGTERM, // kill -SIGTERM
syscall.SIGQUIT, // kill -SIGQUIT
// test
syscall.SIGUSR1,
)
}

func (hk *hk) handleSignal(s syscall.Signal) (err error) {
switch s {
case syscall.SIGHUP:
var (
sb strings.Builder
mem sys.MemStat
ngr = runtime.NumGoroutine()
)
erm := mem.Get()
mem.Str(&sb)
nfd, erf := numOpenFiles()
nlog.Infoln("ngr [", ngr, sys.NumCPU(), "] mem [", sb.String(), erm, "]", "num-fd [", nfd, erf, "]")
case syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT:
signal.Stop(hk.sigCh)
err = cos.NewSignalError(s)
hk.Stop(err)
case syscall.SIGUSR1:
cbUSR1()
default:
cos.ExitLog("unexpected signal:", s)
}

return err
}
4 changes: 3 additions & 1 deletion hk/sigprod.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build !debug

// Package hk provides mechanism for registering cleanup
// functions which are invoked at specified intervals.
/*
Expand All @@ -18,7 +20,7 @@ import (

func (hk *hk) setSignal() {
signal.Notify(hk.sigCh,
// ignore
// ignore, log
syscall.SIGHUP, // kill -SIGHUP
// terminate
syscall.SIGINT, // kill -SIGINT (Ctrl-C)
Expand Down
2 changes: 1 addition & 1 deletion memsys/memsys_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
)

func init() {
hk.TestInit()
hk.Init(false)
}

func TestMemsys(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion space/space_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ var gT *testing.T

func TestEvictCleanup(t *testing.T) {
xreg.Init()
hk.TestInit()
hk.Init(false)
cos.InitShortID(0)

RegisterFailHandler(Fail)
Expand Down
2 changes: 1 addition & 1 deletion transport/transport_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ import (
"github.com/NVIDIA/aistore/hk"
)

func init() { hk.TestInit() }
func init() { hk.Init(false) }

0 comments on commit fce0ada

Please sign in to comment.