Linux Kernel: The Virtual Address Space of a Process

Notes taken while following 简叔's lessons; search for 简说linux on Bilibili.

How a process's virtual address space is managed

What is a virtual address space?

The page-table mapping mechanism is what translates virtual addresses into physical addresses. The kernel itself only needs to maintain a single page global directory, but user space cannot safely be shared among processes, so each process's user address space has to be managed separately. User space is also huge, and if each process kept nothing more than its own page directory, looking up and managing its mappings would be unwieldy. So what does the management mechanism for a process's virtual address space look like?


In Linux, proc is a virtual filesystem and also a control center: it holds a set of special files that reflect the current running state of the kernel. The filesystem exists only in memory and provides a file-style interface for accessing kernel data; changing certain files in it can change how the kernel behaves at runtime.

Write the simplest possible C program and run it.

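Something like the following minimal program is enough (a sketch, not the exact program from the original; hello_maps.c is an assumed name). It prints its own PID and then pauses so its address space can be examined. Build and run it with gcc hello_maps.c -o hello_maps && ./hello_maps &.

/* hello_maps.c: minimal program for inspecting /proc/<pid>/maps. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    printf("pid = %d\n", getpid());
    pause();        /* keep the process alive so /proc/<pid>/maps can be read */
    return 0;
}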

Use the top command to find the process's PID.


Then cat /proc/232759/maps shows the layout of the process's virtual address space.


What the columns mean: each line shows the virtual address range of a mapping, its permissions (r, w, x, plus p for private or s for shared), the offset into the backing file, the device number (major:minor), the inode, and the pathname of the backing file; anonymous mappings such as the heap and the stack show [heap] and [stack] instead of a path.

Managing the virtual address space

The kernel describes a process's virtual address space with the data structure below, defined in include/linux/mm_types.h. Its first member, mmap, points to a vm_area_struct, the structure that describes a single virtual memory area (VMA); an address space consists of many VMAs, and as the member's comment says they are chained together as a linked list. The pgd field points to the page global directory, which is required for address translation; if two processes share the same page directory, they can effectively be treated as threads.

struct mm_struct {
    struct vm_area_struct *mmap;        /* list of VMAs */
    struct rb_root mm_rb;
    u64 vmacache_seqnum;                /* per-thread vmacache */
#ifdef CONFIG_MMU
    unsigned long (*get_unmapped_area) (struct file *filp,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags);
#endif
    unsigned long mmap_base;            /* base of mmap area */
    unsigned long mmap_legacy_base;     /* base of mmap area in bottom-up allocations */
    unsigned long task_size;            /* size of task vm space */
    unsigned long highest_vm_end;       /* highest vma end address */
    pgd_t * pgd;                        /* address of the page global directory */
    atomic_t mm_users;                  /* How many users with user space? */
    atomic_t mm_count;                  /* How many references to "struct mm_struct" (users count as 1) */
    atomic_long_t nr_ptes;              /* PTE page table pages */
#if CONFIG_PGTABLE_LEVELS > 2
    atomic_long_t nr_pmds;              /* PMD page table pages */
#endif
    int map_count;                      /* number of VMAs */

    spinlock_t page_table_lock;         /* Protects page tables and some counters */
    struct rw_semaphore mmap_sem;

    struct list_head mmlist;            /* List of maybe swapped mm's. These are globally strung
                                         * together off init_mm.mmlist, and are protected
                                         * by mmlist_lock
                                         */

    unsigned long hiwater_rss;          /* High-watermark of RSS usage */
    unsigned long hiwater_vm;           /* High-water virtual memory usage */

    unsigned long total_vm;             /* Total pages mapped */
    unsigned long locked_vm;            /* Pages that have PG_mlocked set */
    unsigned long pinned_vm;            /* Refcount permanently increased */
    unsigned long data_vm;              /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
    unsigned long exec_vm;              /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
    unsigned long stack_vm;             /* VM_STACK */
    unsigned long def_flags;
    unsigned long start_code, end_code, start_data, end_data;
    unsigned long start_brk, brk, start_stack;
    unsigned long arg_start, arg_end, env_start, env_end;

    unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

    /*
     * Special counters, in some configurations protected by the
     * page_table_lock, in other configurations by being atomic.
     */
    struct mm_rss_stat rss_stat;

    struct linux_binfmt *binfmt;

    cpumask_var_t cpu_vm_mask_var;

    /* Architecture-specific MM context */
    mm_context_t context;

    unsigned long flags;                /* Must use atomic bitops to access the bits */

    struct core_state *core_state;      /* coredumping support */
#ifdef CONFIG_AIO
    spinlock_t ioctx_lock;
    struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
    /*
     * "owner" points to a task that is regarded as the canonical
     * user/owner of this mm. All of the following must be true in
     * order for it to be changed:
     *
     * current == mm->owner
     * current->mm != mm
     * new_owner->mm == mm
     * new_owner->alloc_lock is held
     */
    struct task_struct __rcu *owner;
#endif
    struct user_namespace *user_ns;

    /* store ref to file /proc/<pid>/exe symlink points to */
    struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
    struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
    pgtable_t pmd_huge_pte;             /* protected by page_table_lock */
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
    struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
    /*
     * numa_next_scan is the next time that the PTEs will be marked
     * pte_numa. NUMA hinting faults will gather statistics and migrate
     * pages to new nodes if necessary.
     */
    unsigned long numa_next_scan;

    /* Restart point for scanning and setting pte_numa */
    unsigned long numa_scan_offset;

    /* numa_scan_seq prevents two threads setting pte_numa */
    int numa_scan_seq;
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
    /*
     * An operation with batched TLB flushing is going on. Anything that
     * can move process memory needs to flush the TLB when moving a
     * PROT_NONE or PROT_NUMA mapped page.
     */
    bool tlb_flush_pending;
#endif
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
    /* See flush_tlb_batched_pending() */
    bool tlb_flush_batched;
#endif
    struct uprobes_state uprobes_state;
#ifdef CONFIG_X86_INTEL_MPX
    /* address of the bounds directory */
    void __user *bd_addr;
#endif
#ifdef CONFIG_HUGETLB_PAGE
    atomic_long_t hugetlb_usage;
#endif
    struct work_struct async_put_work;
};
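
To make the list structure concrete, here is a small sketch (not from the original) that walks a process's VMAs the way the /proc/<pid>/maps code conceptually does, using the mmap list head and the mmap_sem lock from the struct above; the vm_area_struct fields it reads are shown in the next listing.

/* Sketch: print every VMA of an address space (old linked-list VMA API). */
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/printk.h>

static void dump_vmas(struct mm_struct *mm)
{
    struct vm_area_struct *vma;

    down_read(&mm->mmap_sem);                   /* the VMA list is protected by mmap_sem */
    for (vma = mm->mmap; vma; vma = vma->vm_next)
        printk("%lx-%lx %c%c%c\n",
               vma->vm_start, vma->vm_end,
               (vma->vm_flags & VM_READ)  ? 'r' : '-',
               (vma->vm_flags & VM_WRITE) ? 'w' : '-',
               (vma->vm_flags & VM_EXEC)  ? 'x' : '-');
    up_read(&mm->mmap_sem);
}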

Now let's look at the structure that describes a virtual memory area (VMA). The kernel manages each memory area as a separate memory object, so a program's code segment, data segment, uninitialized globals (BSS), heap and stack each get their own vm_area_struct, and the virtual address space strings these vm_area_structs together.

Inside it there is an mm_struct pointer, and an mm_struct corresponds one-to-one with a task_struct, so from a VMA we can tell which process it belongs to.

/*
 * This struct defines a memory VMM memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
    /* The first cache line has the info for VMA tree walking. */

    unsigned long vm_start;             /* Our start address within vm_mm. */
    unsigned long vm_end;               /* The first byte after our end address
                                           within vm_mm. */

    /* linked list of VM areas per task, sorted by address */
    struct vm_area_struct *vm_next, *vm_prev;

    struct rb_node vm_rb;

    /*
     * Largest free memory gap in bytes to the left of this VMA.
     * Either between this VMA and vma->vm_prev, or between one of the
     * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
     * get_unmapped_area find a free area of the right size.
     */
    unsigned long rb_subtree_gap;

    /* Second cache line starts here. */

    struct mm_struct *vm_mm;            /* The address space we belong to. */
    pgprot_t vm_page_prot;              /* Access permissions of this VMA. */
    unsigned long vm_flags;             /* Flags, see mm.h; the access rights (readable, writable, etc.) */

    /*
     * For areas with an address space and backing store,
     * linkage into the address_space->i_mmap interval tree.
     */
    struct {
        struct rb_node rb;
        unsigned long rb_subtree_last;
    } shared;

    /*
     * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
     * list, after a COW of one of the file pages. A MAP_SHARED vma
     * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
     * or brk vma (with NULL file) can only be in an anon_vma list.
     */
    struct list_head anon_vma_chain;    /* Serialized by mmap_sem &
                                         * page_table_lock */
    struct anon_vma *anon_vma;          /* Serialized by page_table_lock */

    /* Function pointers to deal with this struct. */
    const struct vm_operations_struct *vm_ops;

    /* Information about our backing store: */
    unsigned long vm_pgoff;             /* Offset (within vm_file) in PAGE_SIZE
                                           units */
    struct file * vm_file;              /* File we map to (can be NULL). */
    void * vm_private_data;             /* was vm_pte (shared mem) */

#ifndef CONFIG_MMU
    struct vm_region *vm_region;        /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *vm_policy;        /* NUMA policy for the VMA */
#endif
    struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};
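
As a small illustration (not from the original) of how these fields are used, this is roughly the check a page-fault handler performs: look up the VMA covering the faulting address with the kernel's find_vma() helper and compare the requested access against vm_flags.

#include <linux/mm.h>

/* Sketch: is there a VMA covering addr, and does it permit the access? */
static int vma_allows_access(struct mm_struct *mm, unsigned long addr, int write)
{
    struct vm_area_struct *vma = find_vma(mm, addr);    /* first VMA with addr < vm_end */

    if (!vma || addr < vma->vm_start)
        return 0;                                       /* no mapping here: would be a SIGSEGV */
    if (write && !(vma->vm_flags & VM_WRITE))
        return 0;                                       /* mapping exists but is not writable */
    return 1;
}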


The following functions map a file into a process's address space; concretely, the kernel allocates another VMA for the process and adds it to the mm_struct.

#include <sys/mman.h>

void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
int munmap(void *addr, size_t length);
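
For example, a user-space sketch (not from the original) that maps a file and reads its contents through the mapping:

/* Sketch: map a file read-only and write it to stdout via the mapping. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
    int fd;
    struct stat st;
    char *p;

    if (argc < 2)
        return 1;
    fd = open(argv[1], O_RDONLY);
    if (fd < 0 || fstat(fd, &st) < 0)
        return 1;

    /* after this call a new file-backed VMA appears in /proc/self/maps */
    p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (p == MAP_FAILED)
        return 1;

    fwrite(p, 1, st.st_size, stdout);   /* read the file contents via the mapping */

    munmap(p, st.st_size);
    close(fd);
    return 0;
}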

In Linux, if clone() is called with the CLONE_VM flag set, we call the resulting process a thread: threads share the same virtual address space. fork() uses copy_mm() to copy the parent's mm_struct, i.e. the current->mm field, to its child (kernel/fork.c). The struct task_struct *tsk parameter is the child's task_struct.

/* kernel/fork.c */

static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
    struct mm_struct *mm, *oldmm;
    int retval;

    tsk->min_flt = tsk->maj_flt = 0;
    tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
    tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

    tsk->mm = NULL;
    tsk->active_mm = NULL;

    /*
     * Are we cloning a kernel thread?
     *
     * We need to steal a active VM for that..
     */
    /* take the current (parent) process's mm */
    oldmm = current->mm;
    if (!oldmm)
        return 0;

    /* initialize the new vmacache entries */
    vmacache_flush(tsk);

    if (clone_flags & CLONE_VM) {
        atomic_inc(&oldmm->mm_users);
        mm = oldmm;
        goto good_mm;
    }

    retval = -ENOMEM;
    mm = dup_mm(tsk);
    if (!mm)
        goto fail_nomem;

    /* hand the mm over to the child process */
good_mm:
    tsk->mm = mm;
    tsk->active_mm = mm;
    return 0;

fail_nomem:
    return retval;
}
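
The CLONE_VM branch above is exactly what makes threads share one address space. From user space the effect can be demonstrated with a raw clone() call; in this sketch (not from the original) the child writes a variable and the parent then sees the new value, because both use the same mm_struct.

/* Sketch: create a "thread" the raw way with clone(CLONE_VM). */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int shared = 0;

static int child_fn(void *arg)
{
    (void)arg;
    shared = 42;                    /* same address space, so the parent sees this */
    return 0;
}

int main(void)
{
    char *stack = malloc(64 * 1024);
    pid_t pid;

    /* the stack grows down, so pass the top of the allocated region */
    pid = clone(child_fn, stack + 64 * 1024, CLONE_VM | SIGCHLD, NULL);
    waitpid(pid, NULL, 0);
    printf("shared = %d\n", shared);    /* prints 42 */
    free(stack);
    return 0;
}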

How the user-space mmap function is implemented in the kernel

User space can use mmap() to map a file or a device into the process's address space. Is it really what was described above, i.e. creating a VMA and inserting it into the address space? The point of the mapping is speed: reading and writing the file becomes faster, because without it every access also involves copying data between user space and kernel space, which badly hurts performance when a lot of data is involved.

A user-space mmap() call enters the kernel through a system call and ends up in do_mmap(). do_mmap():
1. first creates and initializes a new VMA and inserts it into the process's virtual address space;
2. then calls the lower-level mmap implementation to connect the VMA to actual physical storage (i.e. to set up the page tables); this lower-level mmap differs according to the file type. A rough sketch of the flow is shown below.
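
A heavily simplified sketch of that flow (the real code is do_mmap()/mmap_region() in mm/mmap.c; error handling, VMA merging and accounting are omitted, and the exported insert_vm_struct() stands in for the internal vma_link() step):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

static unsigned long mmap_region_sketch(struct file *file, unsigned long addr,
                                        unsigned long len, vm_flags_t vm_flags,
                                        unsigned long pgoff)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;

    /* 1. allocate and initialize a new VMA, describing [addr, addr+len) */
    vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
    vma->vm_mm = mm;
    vma->vm_start = addr;
    vma->vm_end = addr + len;
    vma->vm_flags = vm_flags;
    vma->vm_page_prot = vm_get_page_prot(vm_flags);
    vma->vm_pgoff = pgoff;

    /* 2. let the file-type-specific mmap connect the VMA to real storage */
    if (file) {
        vma->vm_file = get_file(file);
        file->f_op->mmap(file, vma);
    }

    insert_vm_struct(mm, vma);          /* link into the mm's list and rbtree */
    return addr;
}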

Implementing mmap in a driver

A device driver's mmap implementation maps the device's accessible region into a process's virtual address space, so that user space can access that region directly through a pointer. The driver's mmap essentially has one job: establish the mapping from the device's accessible region to addresses in the process's virtual address space. It must also ensure that the process cannot treat this mapped region as ordinary memory, so a set of protection flags is applied to it.

The driver's mmap establishes the virtual-to-physical mapping

/* Building the mapping between a VMA and physical addresses is done by
 * remap_pfn_range(), whose prototype is: */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long virt_addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot);
/*
 * vma       the VMA in which the mapping is being built
 * virt_addr the start of the virtual address range to map
 * pfn       the page frame number of the physical address the range should map to;
 *           simply the physical address shifted right by PAGE_SHIFT (usually 12 bits)
 * size      the size of the area to map, in bytes
 * prot      the protection bits, taken from vma->vm_page_prot
 */

Look back at the file_operations structure, which represents the capabilities a driver provides: to give a driver mmap support, we implement its int (*mmap) member, which establishes the mapping between the file and the VMA, using the remap_pfn_range() function above.


int (*mmap) (struct file *, struct vm_area_struct *);
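
As an illustration only (not from the original), a driver's mmap method built on remap_pfn_range() might look roughly like this; mydev_mmap, MYDEV_REGION_SIZE and dev_phys_addr are hypothetical names standing in for a real device's region.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>

/* hypothetical device parameters; replace with the real region of your device */
#define MYDEV_REGION_SIZE  0x1000
static unsigned long dev_phys_addr = 0x10000000;

static int mydev_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned long size = vma->vm_end - vma->vm_start;

    if (size > MYDEV_REGION_SIZE)               /* refuse to expose more than the device has */
        return -EINVAL;

    /* device memory: don't let the kernel treat it as ordinary, swappable pages */
    vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

    /* build the page tables: [vm_start, vm_start+size) -> physical pages at dev_phys_addr */
    if (remap_pfn_range(vma, vma->vm_start,
                        dev_phys_addr >> PAGE_SHIFT,
                        size, vma->vm_page_prot))
        return -EAGAIN;

    return 0;
}

static const struct file_operations mydev_fops = {
    .owner = THIS_MODULE,
    .mmap  = mydev_mmap,
};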

mmap is therefore a zero-copy technique.

A process's user stack and kernel stack

  • User stack:
    implemented through the process's virtual-address-space management machinery, i.e. as a VMA (see the small demo after this list);
  • Kernel stack:
    every process has its own private kernel stack;
    its size is fixed and architecture-dependent, typically a couple of pages (8 KB on 32-bit ARM/x86, 16 KB on x86-64).
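
A quick user-space demonstration (illustrative sketch) that the user stack really is just a VMA: print the address of a local variable and compare it with the [stack] entry of the process's maps file.

/* Sketch: a local variable's address falls inside the [stack] VMA. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    int local = 0;
    char cmd[64];

    printf("&local = %p\n", (void *)&local);
    snprintf(cmd, sizeof cmd, "grep stack /proc/%d/maps", getpid());
    system(cmd);    /* prints the [stack] VMA; &local lies within its range */
    return 0;
}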

How the Linux kernel describes a process

The process descriptor task_struct lives in include/linux/sched.h. It is rather long (a single struct of four or five hundred lines), so it is not pasted here in full; it describes the architecture-independent part of a Linux process. One of its members is a thread_info structure, which describes the data that the architecture-specific assembly code needs to access.

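For orientation, the opening of task_struct in kernels of this era looks roughly like this (nearly all fields elided; the exact layout varies with version and configuration):

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
    /* must stay first, so that current_thread_info() can find it */
    struct thread_info thread_info;
#endif
    volatile long state;                /* -1 unrunnable, 0 runnable, >0 stopped */
    void *stack;                        /* the process's kernel stack */
    /* ... hundreds of fields ... */
    struct mm_struct *mm;               /* the virtual address space described above */
    struct mm_struct *active_mm;
    /* ... */
};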

The kernel stack is defined as follows:

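In kernels of this era the definition in include/linux/sched.h is roughly this union: thread_info sits at the base of the stack area, and the remaining THREAD_SIZE bytes are the stack itself.

union thread_union {
    struct thread_info thread_info;     /* at the base of the kernel stack */
    unsigned long stack[THREAD_SIZE / sizeof(long)];
};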

thread_info holds properties of the specific architecture; since they differ from one architecture to another, each architecture implements its own version. The ARM one, for example, is shown below.

arch/arm/include/asm/thread_info.h

struct thread_info {
    unsigned long flags;                /* low level flags */
    int preempt_count;                  /* 0 => preemptable, <0 => bug */
    mm_segment_t addr_limit;            /* address limit */
    struct task_struct *task;           /* main task structure */
    __u32 cpu;                          /* cpu */
    __u32 cpu_domain;                   /* cpu domain */
    struct cpu_context_save cpu_context; /* cpu context */
    __u32 syscall;                      /* syscall number */
    __u8 used_cp[16];                   /* thread used copro */
    unsigned long tp_value[2];          /* TLS registers */
#ifdef CONFIG_CRUNCH
    struct crunch_state crunchstate;
#endif
    union fp_state fpstate __attribute__((aligned(8)));
    union vfp_state vfpstate;
#ifdef CONFIG_ARM_THUMBEE
    unsigned long thumbee_state;        /* ThumbEE Handler Base register */
#endif
};
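
Because thread_info sits at the bottom of the kernel stack, the kernel can recover it from the current stack pointer by masking off the low bits. The ARM helper in the same header looks roughly like this:

/* From arch/arm/include/asm/thread_info.h (roughly): read the stack pointer,
 * then round it down to a THREAD_SIZE boundary to find thread_info. */
register unsigned long current_stack_pointer asm ("sp");

static inline struct thread_info *current_thread_info(void)
{
    return (struct thread_info *)
        (current_stack_pointer & ~(THREAD_SIZE - 1));
}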