diff --git a/assets/syscall/flow.jpg b/assets/syscall/flow.jpg new file mode 100755 index 00000000..5e85b84d Binary files /dev/null and b/assets/syscall/flow.jpg differ diff --git a/assets/syscall/kernel-livepatching1.png b/assets/syscall/kernel-livepatching1.png new file mode 100644 index 00000000..80a5508c Binary files /dev/null and b/assets/syscall/kernel-livepatching1.png differ diff --git a/assets/syscall/kernel-livepatching2.png b/assets/syscall/kernel-livepatching2.png new file mode 100644 index 00000000..92a0f6ca Binary files /dev/null and b/assets/syscall/kernel-livepatching2.png differ diff --git a/examples/Makefile b/examples/Makefile index 95558a7b..47586455 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -17,6 +17,7 @@ obj-m += kbleds.o obj-m += sched.o obj-m += chardev2.o obj-m += syscall.o +obj-m += syscall-ftrace.o obj-m += intrpt.o obj-m += cryptosha256.o obj-m += cryptosk.o diff --git a/examples/syscall-ftrace.c b/examples/syscall-ftrace.c new file mode 100644 index 00000000..112ba962 --- /dev/null +++ b/examples/syscall-ftrace.c @@ -0,0 +1,227 @@ +/** + * syscall-ftrace.c + * + * System call "stealing" with ftrace + * + * We create a callback function that contains + * an unconditional jump to our spying function, + * which will then return control to the original one. + * + * The callback function is triggered by ftrace. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +#define MAX_FILENAME_SIZE 200 + +#undef pr_fmt +#define pr_fmt(fmt) "[syscall-ftrace] " fmt + +/** UID we want to spy on - will be filled from the command line. */ +static int uid = 0; +module_param(uid, int, 0644); + +/** + * This is a housekeeping structure that saves all information + * needed for hooking. Usage with `PREPARE_HOOK` is recommended. 
+ *
+ * Example:
+ *      static ftrace_hook_t sys_clone_hook =
+ *              PREPARE_HOOK(__NR_clone, my_sys_clone, &orig_sys_clone)
+ */
+typedef struct ftrace_hook {
+    unsigned long nr; // syscall number from unistd.h
+    void *new; // hook function
+    void *orig; // where the address of the original function is stored
+
+    unsigned long address; // address to the original function
+    struct ftrace_ops ops; // ftrace structure
+} ftrace_hook_t;
+
+#define PREPARE_HOOK(_nr, _hook, _orig)                  \
+    {                                                    \
+        .nr = (_nr), .new = (_hook), .orig = (_orig)     \
+    }
+
+static unsigned long **sys_call_table;
+
+/**
+ * Look up the handler of the syscall selected by hook->nr.
+ *
+ * For the sake of simplicity, only the kprobe method is included.
+ * If you want to know more about different methods to get
+ * kallsyms_lookup_name, see syscall.c.
+ *
+ * On success, hook->address holds the entry read from sys_call_table and
+ * the same value is written through hook->orig.
+ *
+ * Returns 0 on success and a negative errno on failure.
+ */
+static int resolve_address(ftrace_hook_t *hook)
+{
+    /**
+     * A fresh probe is used for every call: once registered, a probe has
+     * both symbol_name and addr set, and registering it again is expected
+     * to be rejected by register_kprobe().
+     */
+    struct kprobe kp = { .symbol_name = "kallsyms_lookup_name" };
+    unsigned long (*lookup_name)(const char *name);
+    int err;
+
+    err = register_kprobe(&kp);
+    if (err) {
+        pr_err("kallsyms_lookup_name is not found!\n");
+        return err;
+    }
+    /** The probe is only needed to learn the address of the symbol. */
+    lookup_name = (unsigned long (*)(const char *))kp.addr;
+    unregister_kprobe(&kp);
+
+    if (!lookup_name) {
+        pr_err("kallsyms_lookup_name is not found!\n");
+        return -ENOENT;
+    }
+    pr_info("kallsyms_lookup_name is found at 0x%lx\n",
+            (unsigned long)lookup_name);
+
+    sys_call_table = (unsigned long **)lookup_name("sys_call_table");
+    if (!sys_call_table) {
+        pr_err("sys_call_table is not found!\n");
+        return -ENOENT;
+    }
+    pr_info("sys_call_table is found at 0x%lx\n",
+            (unsigned long)sys_call_table);
+
+    hook->address = (unsigned long)sys_call_table[hook->nr];
+    /** Hand the original entry point back to the owner of the hook. */
+    *((unsigned long *)hook->orig) = hook->address;
+    return 0;
+}
+
+/**
+ * This is where the magic happens.
+ *
+ * We check whether this function is called by the kernel or this module
+ * by checking whether parent_ip is within this module.
+ *
+ * During the first call, parent_ip points to somewhere in the kernel
+ * that's not in this module,
+ * while the second call is in this module
+ * since it's called from our_sys_openat.
+ *
+ * If it is the first call, we modify ip to be our_sys_openat,
+ * which will pass control to it after ftrace is done.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
+static void notrace ftrace_thunk(unsigned long ip, unsigned long parent_ip,
+                                 struct ftrace_ops *ops,
+                                 struct ftrace_regs *fregs)
+{
+    ftrace_hook_t *hook = container_of(ops, ftrace_hook_t, ops);
+
+    /** Calls coming from this module must reach the original function. */
+    if (!within_module(parent_ip, THIS_MODULE))
+        fregs->regs.ip = (unsigned long)hook->new;
+}
+
+#else /** Version < v5.11 */
+static void notrace ftrace_thunk(unsigned long ip, unsigned long parent_ip,
+                                 struct ftrace_ops *ops, struct pt_regs *regs)
+{
+    ftrace_hook_t *hook = container_of(ops, ftrace_hook_t, ops);
+
+    /** Calls coming from this module must reach the original function. */
+    if (!within_module(parent_ip, THIS_MODULE))
+        regs->ip = (unsigned long)hook->new;
+}
+
+#endif /** Version >= v5.11 */
+
+/**
+ * Resolve the hooked function and attach ftrace_thunk to it.
+ *
+ * Returns 0 on success. On failure the error of the failing step is
+ * returned and no filter entry is left behind on hook->ops.
+ */
+static int install_hook(ftrace_hook_t *hook)
+{
+    int err;
+
+    err = resolve_address(hook);
+    if (err)
+        return err;
+
+    /** The callback function */
+    hook->ops.func = ftrace_thunk;
+    /** We need registers and we're modifying ip */
+    hook->ops.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY;
+    /** Only sys_openat should be traced */
+    err = ftrace_set_filter_ip(&hook->ops, hook->address, 0, 0);
+    if (err) {
+        pr_err("ftrace_set_filter_ip() failed: %d\n", err);
+        return err;
+    }
+
+    err = register_ftrace_function(&hook->ops);
+    if (err) {
+        pr_err("register_ftrace_function() failed: %d\n", err);
+        /** Roll back the filter that was added above */
+        ftrace_set_filter_ip(&hook->ops, hook->address, 1, 0);
+        return err;
+    }
+
+    return 0;
+}
+
+/**
+ * Detach the callback and drop the filter entry added by install_hook.
+ * Failures are only logged because there is nothing left to undo.
+ */
+static void remove_hook(ftrace_hook_t *hook)
+{
+    int err;
+
+    err = unregister_ftrace_function(&hook->ops);
+    if (err)
+        pr_err("unregister_ftrace_function() failed: %d\n", err);
+
+    /** Disable the trace by setting remove to 1 */
+    err = ftrace_set_filter_ip(&hook->ops, hook->address, 1, 0);
+    if (err)
+        pr_err("ftrace_set_filter_ip() failed: %d\n", err);
+}
+
+/** For some reason the kernel segfaults when the parameters are expanded.
*/
+static asmlinkage long (*original_call)(struct pt_regs *regs);
+
+/**
+ * Replacement for sys_openat: logs the file name when the caller runs with
+ * the UID given by the `uid` module parameter, then always forwards the
+ * call to the original handler so the syscall behaves as usual.
+ */
+static asmlinkage long our_sys_openat(struct pt_regs *regs)
+{
+    char *kfilename;
+    long copied;
+
+    if (current->cred->uid.val != uid)
+        return original_call(regs);
+    kfilename = kmalloc(MAX_FILENAME_SIZE * sizeof(char), GFP_KERNEL);
+    if (!kfilename)
+        return original_call(regs);
+
+    /**
+     * This may only work in x86_64 because getting parameters
+     * from CPU registers is architecture-dependent.
+     *
+     * Change regs->si to appropriate registers
+     * if you are trying on different architecture.
+     *
+     * The file name is a NUL-terminated string of unknown length, so it is
+     * copied with strncpy_from_user(), which stops at the terminator and
+     * reports faults as a negative value.
+     */
+    copied = strncpy_from_user(kfilename, (const char __user *)regs->si,
+                               MAX_FILENAME_SIZE);
+    if (copied < 0) {
+        kfree(kfilename);
+        return original_call(regs);
+    }
+    /** No terminator is written when the name fills the whole buffer. */
+    if (copied >= MAX_FILENAME_SIZE)
+        kfilename[MAX_FILENAME_SIZE - 1] = '\0';
+
+    pr_info("File opened by UID %d: %s\n", uid, kfilename);
+    kfree(kfilename);
+
+    return original_call(regs);
+}
+
+static ftrace_hook_t sys_openat_hook =
+    PREPARE_HOOK(__NR_openat, our_sys_openat, &original_call);
+
+/** Module entry point: install the openat hook. */
+static int __init syscall_ftrace_start(void)
+{
+    int err;
+
+    err = install_hook(&sys_openat_hook);
+    if (err)
+        return err;
+    pr_info("hooked, spying on UID %d\n", uid);
+    return 0;
+}
+
+/** Module exit point: remove the openat hook again. */
+static void __exit syscall_ftrace_end(void)
+{
+    remove_hook(&sys_openat_hook);
+    pr_info("removed\n");
+}
+
+module_init(syscall_ftrace_start);
+module_exit(syscall_ftrace_end);
diff --git a/lkmpg.tex b/lkmpg.tex index 29ffc906..04285bb2 100644 --- a/lkmpg.tex +++ b/lkmpg.tex @@ -1554,6 +1554,102 @@ \section{System Calls} \samplec{examples/syscall.c} +Another technique we can utilize to control the flow of execution of a syscall is \verb|ftrace|. +It is an internal tracer designed to help out developers and designers of systems to find what is going on inside the kernel. +It can be used for debugging or analyzing latencies and performance issues that take place outside of user-space. +It is usually used as an event tracer by attaching callbacks to the beginning of functions in order to record and trace the flow of the kernel. 
+ +\begin{code} +struct ftrace_ops { + ftrace_func_t func; // callback function + unsigned long flags; // ftrace flags + void* private; // any private data +}; +void callback_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs); +\end{code} + +Above is the basic structure of the callback function, where the parameters are + +\begin{itemize} + \item \cpp|ip|: The instruction pointer of the function being traced. + \item \cpp|parent_ip|: The instruction pointer of the caller of the traced function. + \item \cpp|ops|: A pointer to the \cpp|ftrace_ops| that was used to register the callback. + \item \cpp|regs|: If \cpp|FTRACE_OPS_FL_SAVE_REGS| or \cpp|FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED| is set in the \cpp|ftrace_ops| structure, + then this points to the \cpp|pt_regs| structure, just as it would if a breakpoint were placed at the start of the function that \verb|ftrace| is tracing, giving access to the CPU registers. + Otherwise it either contains garbage, or \cpp|NULL|. + Note that from kernel v5.11 onwards, this is replaced with \cpp|struct ftrace_regs *fregs|, with the original \cpp|pt_regs| accessible by \cpp|fregs->regs|. +\end{itemize} + +Internally, there's a 5-byte \cpp|call| to \cpp|__fentry__| at the beginning (BEFORE function prologue) of a traceable kernel function, which is converted to \cpp|nop| during boot to prevent overhead. When a trace is registered, it is changed back to a call to \cpp|__fentry__| and the registered callback will be executed accordingly. + +But callbacks can do more. +Since a callback is called at the start of a function, +and we have access to CPU registers, +maybe we can ``hijack'' the traced function by modifying the instruction pointer? +Yes, this is possible by enabling the \cpp|FTRACE_OPS_FL_IPMODIFY| flag when registering a trace. +It allows us to modify the instruction pointer register, which effectively becomes an unconditional jump once the \verb|ftrace| callback returns. 
+Note that while there can be multiple tracers on one function, only one tracer that changes \cpp|ip| can be registered at the same time. + +Figure~\ref{img:ftrace-hooking-example} gives an example of auditing \cpp|sys_execve| by hooking it using \verb|ftrace|. +The callback function (\cpp|fh_ftrace_thunk|) checks whether the call is from the kernel or the module, +and passes control accordingly. +If the call is from the kernel, our auditing function is called. +Otherwise, nothing happens. +The check is important because we're only ``decorating'' the original syscall. +Our auditing function contains a call to the original \cpp|sys_execve|, +which will trigger the callback function again. +Without the check, this would result in an infinite loop. + +\begin{figure}[h] + \centering + \includegraphics[width=\textwidth]{assets/syscall/flow.jpg} + \caption{Linux kernel hooking with ftrace \href{https://www.apriorit.com/dev-blog/546-hooking-linux-functions-2}{Source}} + \label{img:ftrace-hooking-example} +\end{figure} + +In fact, this is what live kernel patching uses. +By redirecting the flow of execution, +end users can use patched functions instead of vulnerable ones without rebooting, as figure~\ref{img:kernel-livepatching} shows. + +\begin{figure}[h] + \centering + \includegraphics[width=\textwidth]{assets/syscall/kernel-livepatching1.png}\\ + \vspace{1cm} + \includegraphics[width=\textwidth]{assets/syscall/kernel-livepatching2.png} + \caption{How live kernel patching works. \href{https://ubuntu.com/blog/an-overview-of-live-kernel-patching}{Source}} + \label{img:kernel-livepatching} +\end{figure} + +For more information regarding \verb|ftrace|, check out \href{https://www.kernel.org/doc/html/latest/trace/ftrace.html}{the kernel documentation} and \href{https://youtu.be/93uE_kWWQjs}{this talk from Steven Rostedt}. + +Before getting our hands dirty, here are some functions we need to know. 
+ +\begin{itemize} + \item \cpp|register_ftrace_function(struct ftrace_ops *ops)|: Enable the tracing call defined by \cpp|ops|. + \item \cpp|unregister_ftrace_function(struct ftrace_ops *ops)|: Disable the tracing call defined by \cpp|ops|. + \item \cpp|ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset)|: Denote which function should be enabled for tracing by its name. If \cpp|buf| is \cpp|NULL|, all functions will be enabled. + \item \cpp|ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset)|: Denote which function should be enabled for tracing by its address. \cpp|remove| should be \cpp|0| when adding a trace, and \cpp|1| when removing one. Note that \cpp|ip| must be the address where the call to \cpp|__fentry__| is located in the function. +\end{itemize} + +Alright, let's write some code. +Below is the source code of the example from above, but rewritten using \verb|ftrace|. +The main differences are the \cpp|install_hook| function, +which prepares our tracee function (\cpp|sys_openat|), +and the callback function (\cpp|ftrace_thunk|). +We need both \cpp|FTRACE_OPS_FL_SAVE_REGS| and \cpp|FTRACE_OPS_FL_IPMODIFY| because we're modifying \cpp|ip|. +Inside \cpp|ftrace_thunk| is where the magic happens. +We check whether it is called from within the module, +and if not, we change the instruction pointer to our ``spying'' function. +The check is performed by checking whether \cpp|parent_ip| is within this module. +During the first call, \cpp|parent_ip| points to somewhere within the kernel, +while during the second call it points to somewhere in our ``spying'' function, which is within the module. + +When inserting the module, you should provide the uid you want to spy on as a parameter. +For example, you can spy on yourself by \verb|sudo insmod syscall-ftrace.ko uid=$UID|. + +\samplec{examples/syscall-ftrace.c} + \section{Blocking Processes and threads} \label{sec:blocking_process_thread} \subsection{Sleep}