diff --git a/link.go b/link.go index a599087..db0f382 100644 --- a/link.go +++ b/link.go @@ -33,6 +33,7 @@ const ( CgroupLegacy Netns Iter + StructOps ) // @@ -47,6 +48,7 @@ type bpfLinkLegacy struct { type BPFLink struct { link *C.struct_bpf_link prog *BPFProg + m *BPFMap linkType LinkType eventName string legacy *bpfLinkLegacy // if set, this is a fake BPFLink diff --git a/map.go b/map.go index 809f0e5..a9aeee8 100644 --- a/map.go +++ b/map.go @@ -73,6 +73,19 @@ func (m *BPFMap) ReuseFD(fd int) error { return nil } +func (m *BPFMap) AttachStructOps() (*BPFLink, error) { + linkC, errno := C.bpf_map__attach_struct_ops(m.bpfMap) + if linkC == nil { + return nil, fmt.Errorf("failed to attach struct_ops: %w", errno) + } + return &BPFLink{ + link: linkC, + m: m, + linkType: StructOps, + eventName: fmt.Sprintf("structOps-%s", m.Name()), + }, nil +} + func (m *BPFMap) Name() string { return C.GoString(C.bpf_map__name(m.bpfMap)) } diff --git a/selftest/common/vmlinux.h b/selftest/common/vmlinux.h index a2e4d74..1ca559f 100644 --- a/selftest/common/vmlinux.h +++ b/selftest/common/vmlinux.h @@ -4,6 +4,11 @@ ; // don't remove: clangd parsing bug https://github.com/clangd/clangd/issues/1167 #pragma clang attribute push(__attribute__((preserve_access_index)), apply_to = record) +enum { + false = 0, + true = 1, +}; + typedef signed char __s8; typedef __s8 s8; typedef s8 int8_t; @@ -141,10 +146,438 @@ struct bpf_iter__task { struct seq_file; +struct rb_node { + long unsigned int __rb_parent_color; + struct rb_node *rb_right; + struct rb_node *rb_left; +}; + +struct bpf_iter_scx_dsq { + u64 __opaque[6]; +}; + +struct list_head { + struct list_head *next; + struct list_head *prev; +}; + +struct scx_dsq_list_node { + struct list_head node; + u32 flags; + u32 priv; +}; + +struct scx_dispatch_q; + +struct bpf_iter_scx_dsq_kern { + struct scx_dsq_list_node cursor; + struct scx_dispatch_q *dsq; + u64 slice; + u64 vtime; +}; + +typedef struct { + s64 counter; +} atomic64_t; + +typedef atomic64_t atomic_long_t; + +typedef struct { + int counter; +} atomic_t; + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 7500, + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + SCX_CPUPERF_ONE = 1024, + SCX_OPS_TASK_ITER_BATCH = 32, +}; + +enum scx_cpu_preempt_reason { + SCX_CPU_PREEMPT_RT = 0, + SCX_CPU_PREEMPT_DL = 1, + SCX_CPU_PREEMPT_STOP = 2, + SCX_CPU_PREEMPT_UNKNOWN = 3, +}; + +enum scx_deq_flags { + SCX_DEQ_SLEEP = 1ULL, + SCX_DEQ_CORE_SCHED_EXEC = 4294967296ULL, +}; + +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 9223372036854775808ULL, + SCX_DSQ_FLAG_LOCAL_ON = 4611686018427387904ULL, + SCX_DSQ_INVALID = 9223372036854775808ULL, + SCX_DSQ_GLOBAL = 9223372036854775809ULL, + SCX_DSQ_LOCAL = 9223372036854775810ULL, + SCX_DSQ_LOCAL_ON = 13835058055282163712ULL, + SCX_DSQ_LOCAL_CPU_MASK = 4294967295ULL, +}; + +enum scx_dsq_iter_flags { + SCX_DSQ_ITER_REV = 65536, + __SCX_DSQ_ITER_HAS_SLICE = 1073741824, + __SCX_DSQ_ITER_HAS_VTIME = 2147483648, + __SCX_DSQ_ITER_USER_FLAGS = 65536, + __SCX_DSQ_ITER_ALL_FLAGS = 3221291008, +}; + +enum scx_dsq_lnode_flags { + SCX_DSQ_LNODE_ITER_CURSOR = 1, + __SCX_DSQ_LNODE_PRIV_SHIFT = 16, +}; + +enum scx_enq_flags { + SCX_ENQ_WAKEUP = 1ULL, + SCX_ENQ_HEAD = 16ULL, + SCX_ENQ_CPU_SELECTED = 1024ULL, + SCX_ENQ_PREEMPT = 4294967296ULL, + SCX_ENQ_REENQ = 1099511627776ULL, + SCX_ENQ_LAST = 2199023255552ULL, + __SCX_ENQ_INTERNAL_MASK = 18374686479671623680ULL, + SCX_ENQ_CLEAR_OPSS = 72057594037927936ULL, + SCX_ENQ_DSQ_PRIQ = 144115188075855872ULL, +}; + +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1, +}; + +enum scx_ent_flags { + SCX_TASK_QUEUED = 1, + SCX_TASK_RESET_RUNNABLE_AT = 4, + SCX_TASK_DEQD_FOR_SLEEP = 8, + SCX_TASK_STATE_SHIFT = 8, + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = 768, + SCX_TASK_CURSOR = -2147483648, +}; + +enum scx_exit_code { + SCX_ECODE_RSN_HOTPLUG = 4294967296ULL, + SCX_ECODE_ACT_RESTART = 281474976710656ULL, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE = 0, + SCX_EXIT_DONE = 1, + SCX_EXIT_UNREG = 64, + SCX_EXIT_UNREG_BPF = 65, + SCX_EXIT_UNREG_KERN = 66, + SCX_EXIT_SYSRQ = 67, + SCX_EXIT_ERROR = 1024, + SCX_EXIT_ERROR_BPF = 1025, + SCX_EXIT_ERROR_STALL = 1026, +}; + +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, + SCX_KF_CPU_RELEASE = 1, + SCX_KF_DISPATCH = 2, + SCX_KF_ENQUEUE = 4, + SCX_KF_SELECT_CPU = 8, + SCX_KF_REST = 16, + __SCX_KF_RQ_LOCKED = 31, + __SCX_KF_TERMINAL = 28, +}; + +enum scx_kick_flags { + SCX_KICK_IDLE = 1, + SCX_KICK_PREEMPT = 2, + SCX_KICK_WAIT = 4, +}; + +enum scx_opi { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = 29, + SCX_OPI_CPU_HOTPLUG_BEGIN = 29, + SCX_OPI_CPU_HOTPLUG_END = 31, + SCX_OPI_END = 31, +}; + +enum scx_ops_enable_state { + SCX_OPS_ENABLING = 0, + SCX_OPS_ENABLED = 1, + SCX_OPS_DISABLING = 2, + SCX_OPS_DISABLED = 3, +}; + +enum scx_ops_flags { + SCX_OPS_KEEP_BUILTIN_IDLE = 1, + SCX_OPS_ENQ_LAST = 2, + SCX_OPS_ENQ_EXITING = 4, + SCX_OPS_SWITCH_PARTIAL = 8, + SCX_OPS_HAS_CGROUP_WEIGHT = 65536, + SCX_OPS_ALL_FLAGS = 65551, +}; + +enum scx_ops_state { + SCX_OPSS_NONE = 0, + SCX_OPSS_QUEUEING = 1, + SCX_OPSS_QUEUED = 2, + SCX_OPSS_DISPATCHING = 3, + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1, +}; + +enum scx_public_consts { + SCX_OPS_NAME_LEN = 128ULL, + SCX_SLICE_DFL = 20000000ULL, + SCX_SLICE_INF = 18446744073709551615ULL, +}; + +enum scx_rq_flags { + SCX_RQ_ONLINE = 1, + SCX_RQ_CAN_STOP_TICK = 2, + SCX_RQ_BAL_PENDING = 4, + SCX_RQ_BAL_KEEP = 8, + SCX_RQ_BYPASSING = 16, + SCX_RQ_IN_WAKEUP = 65536, + SCX_RQ_IN_BALANCE = 131072, +}; + +enum scx_task_state { + SCX_TASK_NONE = 0, + SCX_TASK_INIT = 1, + SCX_TASK_READY = 2, + SCX_TASK_ENABLED = 3, + SCX_TASK_NR_STATES = 4, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1, + SCX_TG_INITED = 2, +}; + +enum scx_wake_flags { + SCX_WAKE_FORK = 4, + SCX_WAKE_TTWU = 8, + SCX_WAKE_SYNC = 16, +}; + +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct scx_dsq_list_node dsq_list; + struct rb_node dsq_priq; + u32 dsq_seq; + u32 dsq_flags; + u32 flags; + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + u32 kf_mask; + struct task_struct *kf_tasks[2]; + atomic_long_t ops_state; + struct list_head runnable_node; + long unsigned int runnable_at; + u64 core_sched_at; + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; + u64 slice; + u64 dsq_vtime; + bool disallow; + struct cgroup *cgrp_moving_from; + struct list_head tasks_node; +}; + +struct scx_cpu_acquire_args; + +struct scx_cpu_release_args; + +struct scx_init_task_args; + +struct scx_exit_task_args; + +struct scx_dump_ctx; + +struct scx_cgroup_init_args; + +struct scx_exit_info; + +struct scx_bstr_buf { + u64 data[12]; + char line[1024]; +}; + +struct scx_cgroup_init_args { + u32 weight; +}; + +struct scx_cpu_acquire_args { +}; + +struct scx_cpu_release_args { + enum scx_cpu_preempt_reason reason; + struct task_struct *task; +}; + +struct scx_dsp_buf_ent { + struct task_struct *task; + long unsigned int qseq; + u64 dsq_id; + u64 enq_flags; +}; + +struct scx_dsp_ctx { + struct rq *rq; + u32 cursor; + u32 nr_tasks; + struct scx_dsp_buf_ent buf[0]; +}; + +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + +struct scx_dump_data { + s32 cpu; + bool first; + s32 cursor; + struct seq_buf *s; + const char *prefix; + struct scx_bstr_buf buf; +}; + +struct scx_exit_info { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + long unsigned int *bt; + u32 bt_len; + char *msg; + char *dump; +}; + +struct scx_exit_task_args { + bool cancelled; +}; + +struct scx_init_task_args { + bool fork; + struct cgroup *cgroup; +}; + +struct pin_cookie { +}; + +struct rq_flags { + long unsigned int flags; + struct pin_cookie cookie; + unsigned int clock_update_flags; +}; + +struct scx_task_iter { + struct sched_ext_entity cursor; + struct task_struct *locked; + struct rq *rq; + struct rq_flags rf; + u32 cnt; +}; + +struct cpumask { + long unsigned int bits[128]; +}; + +typedef struct cpumask cpumask_t; + +struct bpf_iter_bits { + __u64 __opaque[2]; +}; + +struct sched_ext_ops { + s32 (*select_cpu)(struct task_struct *, s32, u64); + void (*enqueue)(struct task_struct *, u64); + void (*dequeue)(struct task_struct *, u64); + void (*dispatch)(s32, struct task_struct *); + void (*tick)(struct task_struct *); + void (*runnable)(struct task_struct *, u64); + void (*running)(struct task_struct *); + void (*stopping)(struct task_struct *, bool); + void (*quiescent)(struct task_struct *, u64); + bool (*yield)(struct task_struct *, struct task_struct *); + bool (*core_sched_before)(struct task_struct *, struct task_struct *); + void (*set_weight)(struct task_struct *, u32); + void (*set_cpumask)(struct task_struct *, const struct cpumask *); + void (*update_idle)(s32, bool); + void (*cpu_acquire)(s32, struct scx_cpu_acquire_args *); + void (*cpu_release)(s32, struct scx_cpu_release_args *); + s32 (*init_task)(struct task_struct *, struct scx_init_task_args *); + void (*exit_task)(struct task_struct *, struct scx_exit_task_args *); + void (*enable)(struct task_struct *); + void (*disable)(struct task_struct *); + void (*dump)(struct scx_dump_ctx *); + void (*dump_cpu)(struct scx_dump_ctx *, s32, bool); + void (*dump_task)(struct scx_dump_ctx *, struct task_struct *); + s32 (*cgroup_init)(struct cgroup *, struct scx_cgroup_init_args *); + void (*cgroup_exit)(struct cgroup *); + s32 (*cgroup_prep_move)(struct task_struct *, struct cgroup *, struct cgroup *); + void (*cgroup_move)(struct task_struct *, struct cgroup *, struct cgroup *); + void (*cgroup_cancel_move)(struct task_struct *, struct cgroup *, struct cgroup *); + void (*cgroup_set_weight)(struct cgroup *, u32); + void (*cpu_online)(s32); + void (*cpu_offline)(s32); + s32 (*init)(void); + void (*exit)(struct scx_exit_info *); + u32 dispatch_max_batch; + u64 flags; + u32 timeout_ms; + u32 exit_dump_len; + u64 hotplug_seq; + char name[128]; +}; + +enum bpf_struct_ops_state { + BPF_STRUCT_OPS_STATE_INIT = 0, + BPF_STRUCT_OPS_STATE_INUSE = 1, + BPF_STRUCT_OPS_STATE_TOBEFREE = 2, + BPF_STRUCT_OPS_STATE_READY = 3, +}; + +struct refcount_struct { + atomic_t refs; +}; + +typedef struct refcount_struct refcount_t; + +struct bpf_struct_ops_common_value { + refcount_t refcnt; + enum bpf_struct_ops_state state; +}; + +struct bpf_struct_ops_sched_ext_ops { + struct bpf_struct_ops_common_value common; + long : 64; + long : 64; + long : 64; + long : 64; + long : 64; + long : 64; + long : 64; + struct sched_ext_ops data; + long : 64; + long : 64; + long : 64; +}; + struct task_struct { pid_t pid; pid_t tgid; struct task_struct *parent; + struct sched_ext_entity scx; + short unsigned int migration_disabled; char comm[16]; }; diff --git a/selftest/struct-ops/.gitignore b/selftest/struct-ops/.gitignore new file mode 100644 index 0000000..b421ce7 --- /dev/null +++ b/selftest/struct-ops/.gitignore @@ -0,0 +1 @@ +scx \ No newline at end of file diff --git a/selftest/struct-ops/Makefile b/selftest/struct-ops/Makefile new file mode 100644 index 0000000..b15a48c --- /dev/null +++ b/selftest/struct-ops/Makefile @@ -0,0 +1,100 @@ +BASEDIR = $(abspath ../../) + +OUTPUT = ../../output + +LIBBPF_SRC = $(abspath ../../libbpf/src) +LIBBPF_OBJ = $(abspath $(OUTPUT)/libbpf.a) + +CLANG = clang +CC = $(CLANG) +GO = go +PKGCONFIG = pkg-config + +ARCH := $(shell uname -m | sed 's/x86_64/amd64/g; s/aarch64/arm64/g') + +# libbpf + +LIBBPF_OBJDIR = $(abspath ./$(OUTPUT)/libbpf) + +CFLAGS = -g -O2 -Wall -fpie -I$(abspath ../common) +LDFLAGS = + +CGO_CFLAGS_STATIC = "-I$(abspath $(OUTPUT)) -I$(abspath ../common)" +CGO_LDFLAGS_STATIC = "$(shell PKG_CONFIG_PATH=$(LIBBPF_OBJDIR) $(PKGCONFIG) --static --libs libbpf)" +CGO_EXTLDFLAGS_STATIC = '-w -extldflags "-static"' +SCX_FLAGS=-mcpu=v3 -mlittle-endian \ +-I ../../libbpf/src/usr/include -I ../../libbpf/include/uapi \ +-I scx/scheds/include/scx -I scx/scheds/include/bpf-compat + +CGO_CFLAGS_DYN = "-I. -I/usr/include/" +CGO_LDFLAGS_DYN = "$(shell $(PKGCONFIG) --shared --libs libbpf)" + +MAIN = main + +.PHONY: $(MAIN) +.PHONY: $(MAIN).go +.PHONY: $(MAIN).bpf.c + +all: $(MAIN)-static + +.PHONY: libbpfgo +.PHONY: libbpfgo-static +.PHONY: libbpfgo-dynamic + +## libbpfgo + +libbpfgo-static: + $(MAKE) -C $(BASEDIR) libbpfgo-static + +libbpfgo-dynamic: + $(MAKE) -C $(BASEDIR) libbpfgo-dynamic + +outputdir: + $(MAKE) -C $(BASEDIR) outputdir + +## test bpf dependency + +$(MAIN).bpf.o: $(MAIN).bpf.c + @if [ ! -d "scx" ]; then \ + git clone -b v1.0.9 https://github.com/sched-ext/scx.git; \ + else \ + echo "scx directory already exists, skipping clone"; \ + fi + $(CLANG) $(CFLAGS) -target bpf -D__TARGET_ARCH_$(ARCH) $(SCX_FLAGS) -I$(OUTPUT) -I$(abspath ../common) -c $< -o $@ + +## test + +.PHONY: $(MAIN)-static +.PHONY: $(MAIN)-dynamic + +$(MAIN)-static: libbpfgo-static | $(MAIN).bpf.o + CC=$(CLANG) \ + CGO_CFLAGS=$(CGO_CFLAGS_STATIC) \ + CGO_LDFLAGS=$(CGO_LDFLAGS_STATIC) \ + GOOS=linux GOARCH=$(ARCH) \ + $(GO) build \ + -tags netgo -ldflags $(CGO_EXTLDFLAGS_STATIC) \ + -o $(MAIN)-static ./$(MAIN).go + +$(MAIN)-dynamic: libbpfgo-dynamic | $(MAIN).bpf.o + CC=$(CLANG) \ + CGO_CFLAGS=$(CGO_CFLAGS_DYN) \ + CGO_LDFLAGS=$(CGO_LDFLAGS_DYN) \ + $(GO) build -o ./$(MAIN)-dynamic ./$(MAIN).go + +## run + +.PHONY: run +.PHONY: run-static +.PHONY: run-dynamic + +run: run-static + +run-static: $(MAIN)-static + sudo ./run.sh $(MAIN)-static + +run-dynamic: $(MAIN)-dynamic + sudo ./run.sh $(MAIN)-dynamic + +clean: + rm -f *.o *-static *-dynamic diff --git a/selftest/struct-ops/go.mod b/selftest/struct-ops/go.mod new file mode 100644 index 0000000..d24b6b4 --- /dev/null +++ b/selftest/struct-ops/go.mod @@ -0,0 +1,7 @@ +module github.com/aquasecurity/libbpfgo/selftest/struct-ops + +go 1.21 + +require github.com/aquasecurity/libbpfgo v0.0.0 + +replace github.com/aquasecurity/libbpfgo => ../../ diff --git a/selftest/struct-ops/go.sum b/selftest/struct-ops/go.sum new file mode 100644 index 0000000..5496456 --- /dev/null +++ b/selftest/struct-ops/go.sum @@ -0,0 +1,8 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/selftest/struct-ops/main.bpf.c b/selftest/struct-ops/main.bpf.c new file mode 100644 index 0000000..90a5bc7 --- /dev/null +++ b/selftest/struct-ops/main.bpf.c @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple scheduler. + * + * By default, it operates as a simple global weighted vtime scheduler and can + * be switched to FIFO scheduling. It also demonstrates the following niceties. + * + * - Statistics tracking how many tasks are queued to local and global dsq's. + * - Termination notification for userspace. + * + * While very simple, this scheduler should work reasonably well on CPUs with a + * uniform L3 cache topology. While preemption is not implemented, the fact that + * the scheduling queue is shared across all CPUs means that whatever is at the + * front of the queue is likely to be executed fairly quickly given enough + * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads + * but comes with the usual problems with FIFO scheduling where saturating + * threads can easily drown out interactive ones. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include + +char _license[] SEC("license") = "GPL"; + +const volatile bool fifo_sched = true; + +static u64 vtime_now; +UEI_DEFINE(uei); + +/* + * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues + * (meaning, cannot be dispatched to with scx_bpf_dsq_insert_vtime()). We + * therefore create a separate DSQ with ID 0 that we dispatch to and consume + * from. If scx_simple only supported global FIFO scheduling, then we could just + * use SCX_DSQ_GLOBAL. + */ +#define SHARED_DSQ 0 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc(0); /* count local queueing */ + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + stat_inc(1); /* count global queueing */ + + if (fifo_sched) { + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (time_before(vtime, vtime_now - SCX_SLICE_DFL)) + vtime = vtime_now - SCX_SLICE_DFL; + + scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, enq_flags); + } +} + +void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_dsq_move_to_local(SHARED_DSQ); +} + +void BPF_STRUCT_OPS(simple_running, struct task_struct *p) +{ + if (fifo_sched) + return; + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. + */ + if (time_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. + */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) +{ + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(simple_ops, + .select_cpu = (void *) simple_select_cpu, + .enqueue = (void *) simple_enqueue, + .dispatch = (void *) simple_dispatch, + .running = (void *) simple_running, + .stopping = (void *) simple_stopping, + .enable = (void *) simple_enable, + .init = (void *) simple_init, + .exit = (void *) simple_exit, + .name = "simple"); diff --git a/selftest/struct-ops/main.go b/selftest/struct-ops/main.go new file mode 100644 index 0000000..1da9de4 --- /dev/null +++ b/selftest/struct-ops/main.go @@ -0,0 +1,121 @@ +package main + +import ( + "context" + "log" + "os" + "os/signal" + "sync" + "syscall" + "time" + + "encoding/binary" + "unsafe" + + bpf "github.com/aquasecurity/libbpfgo" +) + +func endian() binary.ByteOrder { + var i int32 = 0x01020304 + u := unsafe.Pointer(&i) + pb := (*byte)(u) + b := *pb + if b == 0x04 { + return binary.LittleEndian + } + + return binary.BigEndian +} + +func main() { + bpfModule, err := bpf.NewModuleFromFileArgs(bpf.NewModuleArgs{ + BPFObjPath: "main.bpf.o", + KernelLogLevel: 0, + }) + if err != nil { + os.Exit(-1) + } + defer bpfModule.Close() + + if err := bpfModule.BPFLoadObject(); err != nil { + os.Exit(-1) + } + + m := bpfModule + + var afterFunc func() + + iters := m.Iterator() + for { + m := iters.NextMap() + if m == nil { + break + } + if m.Type().String() == "BPF_MAP_TYPE_STRUCT_OPS" { + var link *bpf.BPFLink + if link, err = m.AttachStructOps(); err != nil { + log.Printf("error: %v", err) + os.Exit(-1) + } + afterFunc = func() { + if err := link.Destroy(); err != nil { + log.Printf("error: %v", err) + os.Exit(-1) + } + } + } + } + + var statsMap *bpf.BPFMap + if statsMap, err = bpfModule.GetMap("stats"); err != nil { + log.Printf("error: %v", err) + os.Exit(-1) + } + var wg sync.WaitGroup + ctx, cancel := context.WithCancel(context.Background()) + signalChan := make(chan os.Signal, 1) + signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) + wg.Add(1) + go func(ctx context.Context) { + for true { + select { + case <-ctx.Done(): + wg.Done() + return + default: + res := getStat(statsMap) + log.Printf("local: %d, global: %d", res[0], res[1]) + } + time.Sleep(1 * time.Second) + } + }(ctx) + time.Sleep(3 * time.Second) + cancel() + wg.Wait() + afterFunc() + log.Println("scheduler exit") + os.Exit(0) +} + +func getStat(m *bpf.BPFMap) []uint64 { + cpuNum, err := bpf.NumPossibleCPUs() + if err != nil { + log.Fatal(err) + } + cnts := make([][]uint64, 2) + cnts[0] = make([]uint64, cpuNum) + cnts[1] = make([]uint64, cpuNum) + stats := []uint64{0, 0} + for i := 0; i < 2; i++ { + v, err := m.GetValue(unsafe.Pointer(&i)) + if err != nil { + log.Fatal(err) + } + for cpu := 0; cpu < cpuNum; cpu++ { + n := v[cpu*8 : cpu*8+8] + cnts[i][cpu] = endian().Uint64(n) + stats[i] += cnts[i][cpu] + } + } + return stats +} diff --git a/selftest/struct-ops/run-vm.sh b/selftest/struct-ops/run-vm.sh new file mode 120000 index 0000000..7469344 --- /dev/null +++ b/selftest/struct-ops/run-vm.sh @@ -0,0 +1 @@ +../common/run-vm-stage2.sh \ No newline at end of file diff --git a/selftest/struct-ops/run.sh b/selftest/struct-ops/run.sh new file mode 100755 index 0000000..3a9e486 --- /dev/null +++ b/selftest/struct-ops/run.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +TEST=$(dirname $0)/$1 # execute +TIMEOUT=10 # seconds +KERNEL_VERSION=v6.12.2 # kernel version + +# SETTINGS +COMMON="$(dirname $0)/../common/common.sh" + +vng -v -r $KERNEL_VERSION --rodir="$(realpath ..)" -- "export TEST=$TEST COMMON=$COMMON TIMEOUT=$TIMEOUT; ./run-vm.sh" + +exit 0