映射关系
GVA -> GPA 映射关系 保存在 Guest OS 页表中。
GPA -> HVA 映射关系 保存在 kvm memslot 数组中。
GPA -> HPA 映射关系 保存在 kvm ept 页表中(或者影子页表)。
HVA -> HPA 映射关系 保存在 Host OS 页表中。
相关代码
qemu 进程会准备好可用虚拟内存空间,类似于物理机的内存条
因地址空间不连续,qemu 会注册多个 memslot 到 kvm 模块,相关字段解释如下
struct kvm_memory_slot {
struct hlist_node id_node[2];
struct interval_tree_node hva_node[2];
struct rb_node gfn_node[2];
gfn_t base_gfn; // first guest physical page frame number covered by this slot
unsigned long npages; // number of guest physical pages in this slot
unsigned long *dirty_bitmap;
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr; // HVA: start of the virtual memory range the qemu process registered for this slot
u32 flags;
short id; // slot id: low 16 bits of the user-visible slot number
u16 as_id; // address space id: high 16 bits
};
gfn 转换为 hva
static inline unsigned long
__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
{
/*
 * The index was checked originally in search_memslots. To avoid
 * that a malicious guest builds a Spectre gadget out of e.g. page
 * table walks, do not let the processor speculate loads outside
 * the guest's registered memslots.
 */
unsigned long offset = gfn - slot->base_gfn; // page-frame offset of gfn from the slot's base_gfn
offset = array_index_nospec(offset, slot->npages); // clamp the index to the slot (Spectre mitigation)
return slot->userspace_addr + offset * PAGE_SIZE; // HVA = slot's userspace base address + byte offset
}
下面测试代码用的是内核直接导出的函数 gfn_to_hva
gfn_to_hva -> gfn_to_hva_many -> __gfn_to_hva_memslot
地址转换
测试用系统采用5级页表配置,但是虚拟地址空间只有48bit,所以PGD和P4D重叠,二者页表项内容一致。
Guest: PGD 80000001050e4067 P4D 80000001050e4067
Host : PGD 8000000220eaf067 P4D 8000000220eaf067
从下面测试中可以看到guest上的 4K 页,host上却属于 2M 大页。
这是因为host在ept页表中使用大页,可以减少占用TLB条目,提高命中率,提升性能。
Guest
由 libc.so.6 的一段地址空间首地址 7f34ef90f000(即 GVA),转换得到 GPA 0x13b483000。
# ps
PID USER COMMAND
201 root /sbin/klogd -n
213 root /usr/sbin/crond -f
219 root -sh
220 root /sbin/getty -L tty1 0 vt100
222 root ps
# cat /proc/201/maps
55efe7882000-55efe788e000 r--p 00000000 00:02 458 /bin/busybox
55efe788e000-55efe7914000 r-xp 0000c000 00:02 458 /bin/busybox
55efe7914000-55efe793b000 r--p 00092000 00:02 458 /bin/busybox
55efe793b000-55efe793e000 r--p 000b9000 00:02 458 /bin/busybox
55efe793e000-55efe793f000 rw-p 000bc000 00:02 458 /bin/busybox
55efe7b50000-55efe7b71000 rw-p 00000000 00:00 0 [heap]
7f34ef90c000-7f34ef90f000 rw-p 00000000 00:00 0
7f34ef90f000-7f34ef937000 r--p 00000000 00:02 74 /lib/libc.so.6
# insmod /v2p.ko pid=201 vaddr=0x7f34ef90f000
[ 1727.099719][ T233] PGD 80000001050e4067 P4D 80000001050e4067 PUD 1050e7067 PMD 1016a0067 PTE 800000013b483025
[ 1727.102810][ T233] Physical address for pid 201, virtual address 0x7f34ef90f000: 0x13b483000
HOST
host 上根据 GPA 0x13b483000 和 qemu 进程 pid 304786,转换得到 HVA 0x7fec17283000。
# insmod ./gkvm.ko pid=304786 gpa=0x13b483000
# dmesg
[94762.874954] base pfn 0x 100000, npages 262144, hva 0x7febdbe00000 id 1 as_id 0
[94762.875733] base pfn 0x fffc0, npages 64, hva 0x7fec20600000 id 2 as_id 0
[94762.876478] base pfn 0x fee00, npages 1, hva 0x7fec22b91000 id 510 as_id 0
[94762.877181] base pfn 0x fd000, npages 4096, hva 0x7feb0ae00000 id 4 as_id 0
[94762.877896] base pfn 0x 100, npages 786176, hva 0x7feb1bf00000 id 9 as_id 0
[94762.878609] base pfn 0x f0, npages 16, hva 0x7feb1bef0000 id 8 as_id 0
[94762.879313] base pfn 0x e8, npages 8, hva 0x7feb1bee8000 id 7 as_id 0
[94762.880022] base pfn 0x ce, npages 26, hva 0x7feb1bece000 id 6 as_id 0
[94762.880730] base pfn 0x cb, npages 3, hva 0x7feb1becb000 id 5 as_id 0
[94762.881441] base pfn 0x c0, npages 11, hva 0x7feb1bec0000 id 3 as_id 0
[94762.882140] base pfn 0x 0, npages 160, hva 0x7feb1be00000 id 0 as_id 0
[94762.882849] PGD 8000000220eaf067 P4D 8000000220eaf067 PUD 2e31c3067 PMD 8000000449a008e7
[94762.883560] gpa 0x13b483000, hva 0x7fec17283000, hpa 0x449a83000
virt_to_phys 模块
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/kvm_host.h>
#include <linux/pid.h>
#include <linux/mm_types.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>	/* get_task_mm(), mmput() */
#include <linux/uaccess.h>
#include <linux/huge_mm.h>
/* Target process id, supplied on the insmod command line (pid=<n>). */
static int pid;
module_param(pid, int, 0);
MODULE_PARM_DESC(pid, "Process ID");
/* User virtual address to translate (vaddr=0x...). */
static unsigned long vaddr;
module_param(vaddr, ulong, 0);
MODULE_PARM_DESC(vaddr, "User virtual address");
/*
 * Probe whether the page-table entry pointer itself is readable.
 * Returns non-zero when the kernel cannot safely load from @p.
 */
static int bad_address(void *p)
{
	unsigned long probe;

	return get_kernel_nofault(probe, (unsigned long *)p);
}
static unsigned long v2p(int pid, unsigned long vaddr)
{
struct task_struct *task;
struct mm_struct *mm;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
struct page *page;
unsigned long paddr = 0;
rcu_read_lock();
task = pid_task(find_vpid(pid), PIDTYPE_PID);
if (!task) {
pr_err("No such process with pid: %d\n", pid);
rcu_read_unlock();
return 0;
}
mm = task->mm;
if (!mm) {
pr_err("Process %d does not have a valid memory descriptor\n", pid);
rcu_read_unlock();
return 0;
}
down_read(&mm->mmap_lock);
pgd = pgd_offset(mm, vaddr);
if (bad_address(pgd))
goto bad;
pr_info("PGD %lx ", pgd_val(*pgd));
if (!pgd_present(*pgd))
goto none;
p4d = p4d_offset(pgd, vaddr);
if (bad_address(p4d))
goto bad;
pr_cont("P4D %lx ", p4d_val(*p4d));
if (!p4d_present(*p4d))
goto none;
if (p4d_large(*p4d)) {
page = p4d_page(*p4d);
paddr = page_to_phys(page) + (vaddr & ~P4D_MASK);
goto out;
}
pud = pud_offset(p4d, vaddr);
if (bad_address(pud))
goto bad;
pr_cont("PUD %lx ", pud_val(*pud));
if (!pud_present(*pud))
goto none;
if (pud_large(*pud)) {
page = pud_page(*pud);
paddr = page_to_phys(page) + (vaddr & ~PUD_MASK);
goto out;
}
pmd = pmd_offset(pud, vaddr);
if (bad_address(pmd))
goto bad;
pr_cont("PMD %lx ", pmd_val(*pmd));
if (!pmd_present(*pmd))
goto none;
if (pmd_large(*pmd)) {
page = pmd_page(*pmd);
paddr = page_to_phys(page) + (vaddr & ~PMD_MASK);
goto out;
}
pte = pte_offset_kernel(pmd, vaddr);
if (bad_address(pte))
goto bad;
pr_cont("PTE %lx ", pte_val(*pte));
if (pte_present(*pte)) {
page = pte_page(*pte);
paddr = page_to_phys(page) + (vaddr & ~PAGE_MASK);
} else {
pr_err("Invalid virtual address: 0x%lx\n", vaddr);
}
out:
pr_cont("\n");
up_read(&mm->mmap_lock);
rcu_read_unlock();
return paddr;
bad:
pr_err("bad address\n");
up_read(&mm->mmap_lock);
rcu_read_unlock();
return paddr;
none:
pr_err("no exist\n");
up_read(&mm->mmap_lock);
rcu_read_unlock();
return paddr;
}
/*
 * One-shot module entry: translate the requested address at load time
 * and log the result. Always returns 0 so the module stays loaded.
 */
static int __init v2p_init(void)
{
	unsigned long pa = v2p(pid, vaddr);

	if (!pa)
		pr_err("Failed to get physical address\n");
	else
		pr_info("Physical address for pid %d, virtual address 0x%lx: 0x%lx\n", pid, vaddr, pa);
	return 0;
}
/* Module teardown: nothing to release, just log the unload. */
static void __exit v2p_exit(void)
{
pr_info("Module exited\n");
}
module_init(v2p_init);
module_exit(v2p_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("l3b2w1");
MODULE_DESCRIPTION("get physical address from pid and virtual address");
获取memslot信息
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/kvm_host.h>
#include <linux/pid.h>
#include <linux/mm_types.h>
#include <linux/uaccess.h>
/* QEMU process id; -1 means unset and is rejected in gkvm_init(). */
static int pid = -1;
module_param(pid, int, 0);
MODULE_PARM_DESC(pid, "PID of the QEMU process");

typedef unsigned long ulong;

/* Guest physical address to translate (gpa=0x...). */
static ulong gpa = -1;
module_param(gpa, ulong, 0);
/* fix: the description must name the actual parameter "gpa", not "hva" */
MODULE_PARM_DESC(gpa, "one gpa of the QEMU process");

/* Shared between init (which takes references) and exit (which drops them). */
static struct kvm *kvm;
static struct mm_struct *mm;
static struct task_struct *task;
/*
 * Probe whether the page-table entry pointer itself is readable.
 * Returns non-zero when the kernel cannot safely load from @p.
 */
static int bad_address(void *p)
{
	unsigned long probe;

	return get_kernel_nofault(probe, (unsigned long *)p);
}
static unsigned long v2p(int pid, unsigned long vaddr)
{
struct task_struct *task;
struct mm_struct *mm;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
struct page *page;
unsigned long paddr = 0;
rcu_read_lock();
task = pid_task(find_vpid(pid), PIDTYPE_PID);
if (!task) {
pr_err("No such process with pid: %d\n", pid);
rcu_read_unlock();
return 0;
}
mm = task->mm;
if (!mm) {
pr_err("Process %d does not have a valid memory descriptor\n", pid);
rcu_read_unlock();
return 0;
}
down_read(&mm->mmap_lock);
pgd = pgd_offset(mm, vaddr);
if (bad_address(pgd))
goto bad;
pr_info("PGD %lx ", pgd_val(*pgd));
if (!pgd_present(*pgd))
goto none;
p4d = p4d_offset(pgd, vaddr);
if (bad_address(p4d))
goto bad;
pr_cont("P4D %lx ", p4d_val(*p4d));
if (!p4d_present(*p4d))
goto none;
if (p4d_large(*p4d)) {
page = p4d_page(*p4d);
paddr = page_to_phys(page) + (vaddr & ~P4D_MASK);
goto out;
}
pud = pud_offset(p4d, vaddr);
if (bad_address(pud))
goto bad;
pr_cont("PUD %lx ", pud_val(*pud));
if (!pud_present(*pud))
goto none;
if (pud_large(*pud)) {
page = pud_page(*pud);
paddr = page_to_phys(page) + (vaddr & ~PUD_MASK);
goto out;
}
pmd = pmd_offset(pud, vaddr);
if (bad_address(pmd))
goto bad;
pr_cont("PMD %lx ", pmd_val(*pmd));
if (!pmd_present(*pmd))
goto none;
if (pmd_large(*pmd)) {
page = pmd_page(*pmd);
paddr = page_to_phys(page) + (vaddr & ~PMD_MASK);
goto out;
}
pte = pte_offset_kernel(pmd, vaddr);
if (bad_address(pte))
goto bad;
pr_cont("PTE %lx ", pte_val(*pte));
if (pte_present(*pte)) {
page = pte_page(*pte);
paddr = page_to_phys(page) + (vaddr & ~PAGE_MASK);
} else {
pr_err("Invalid virtual address: 0x%lx\n", vaddr);
}
out:
pr_cont("\n");
up_read(&mm->mmap_lock);
rcu_read_unlock();
return paddr;
bad:
pr_err("bad address\n");
up_read(&mm->mmap_lock);
rcu_read_unlock();
return paddr;
none:
pr_err("no exist\n");
up_read(&mm->mmap_lock);
rcu_read_unlock();
return paddr;
}
/*
 * Translate a guest physical address to the QEMU host virtual address
 * backing it, via the kvm memslots. Returns 0 when no memslot covers
 * the gpa's page frame.
 */
static unsigned long gpa_to_hva(struct kvm *kvm, unsigned long gpa)
{
	unsigned long hva;

	hva = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
	if (kvm_is_error_hva(hva)) {
		/* fix: gfn_to_hva() returns KVM_HVA_ERR_BAD for an unmapped gfn;
		 * the original blindly OR-ed the offset into that error value. */
		pr_err("gpa 0x%lx is not covered by any memslot\n", gpa);
		return 0;
	}
	/* hva is page aligned, so OR-ing in the page offset equals addition */
	return hva | (gpa & ~PAGE_MASK);
}
/*
 * Dump every memslot registered for this VM: guest base pfn, page count,
 * backing HVA, slot id and address-space id.
 * NOTE(review): kvm_for_each_memslot() is used with the two-argument
 * (pre-5.17) signature; kernels >= 5.17 take an extra bucket argument.
 * Confirm against the exact kernel version being built against.
 */
static void show_memslots_info(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
/* slots_lock keeps the memslot array stable while we iterate */
mutex_lock(&kvm->slots_lock);
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
pr_err("[%s %d] host, base pfn 0x%7lx, npages %7lu, hva 0x%lx id %4d as_id %d\n",
__func__, __LINE__, (unsigned long)memslot->base_gfn, memslot->npages,
memslot->userspace_addr, memslot->id, memslot->as_id);
}
mutex_unlock(&kvm->slots_lock);
}
/*
 * Resolve @pid to its struct kvm. On success, holds one reference on the
 * kvm (kvm_get_kvm) and one on the mm (get_task_mm), both stored in the
 * file-scope globals and released in gkvm_exit(). Returns NULL on failure
 * with no references held.
 */
static struct kvm *get_kvm_from_pid(int pid)
{
	/* fix: find_vpid()/pid_task() must run under the RCU read lock */
	rcu_read_lock();
	task = pid_task(find_vpid(pid), PIDTYPE_PID);
	/* get_task_mm() only takes task_lock(), safe under RCU; pins the mm */
	mm = task ? get_task_mm(task) : NULL;
	rcu_read_unlock();

	if (!task) {
		pr_err("Task not found for PID: %d\n", pid);
		return NULL;
	}
	if (!mm) {
		pr_err("No mm_struct found for task with PID: %d\n", pid);
		return NULL;
	}
	/* NOTE(review): mm_kvm() is not in mainline; requires a kernel tree
	 * that links the mm back to its kvm — confirm against the build tree. */
	kvm = mm_kvm(mm);
	if (!kvm) {
		pr_err("No KVM found for task with PID: %d\n", pid);
		mmput(mm);	/* fix: original leaked the mm reference here */
		mm = NULL;
		return NULL;
	}
	kvm_get_kvm(kvm); // inc kvm ref, dropped in gkvm_exit()
	return kvm;
}
/*
 * Module entry: validate parameters, find the VM belonging to @pid,
 * dump its memslots, then translate gpa -> hva -> hpa and log the result.
 */
static int __init gkvm_init(void)
{
	struct kvm *kvm;
	unsigned long hva;
	unsigned long hpa;

	if (pid < 0) {
		pr_err("Invalid PID: %d\n", pid);
		return -EINVAL;
	}
	if (gpa == (ulong)-1) {
		/* fix: reject the unset default instead of translating garbage */
		pr_err("Invalid gpa: parameter is required\n");
		return -EINVAL;
	}
	kvm = get_kvm_from_pid(pid);
	if (!kvm) {
		pr_err("KVM struct not found for PID %d\n", pid);
		return -ESRCH;	/* fix: return a real errno instead of -1 */
	}
	show_memslots_info(kvm);
	hva = gpa_to_hva(kvm, gpa);
	hpa = v2p(pid, hva);
	pr_info("gpa 0x%lx, hva 0x%lx, hpa 0x%lx\n", gpa, hva, hpa);
	return 0;
}
/*
 * Module teardown: drop the kvm and mm references taken in
 * get_kvm_from_pid(). Only runs when gkvm_init() returned 0,
 * so both globals are valid here.
 */
static void __exit gkvm_exit(void)
{
kvm_put_kvm(kvm);
mmput(mm);
pr_info("KVM module exited\n");
}
module_init(gkvm_init);
module_exit(gkvm_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("l3b2w1");
MODULE_DESCRIPTION("KVM Module to get misc info");
acronyms
pfn host page frame number
hpa host physical address
hva host virtual address
gfn guest frame number
gpa guest physical address
gva guest virtual address
ngpa nested guest physical address
ngva nested guest virtual address
pte page table entry (used also to refer generically to paging structure entries)
gpte guest pte (referring to gfns)
spte shadow pte (referring to pfns)
tdp two dimensional paging (AMD's NPT and Intel's EPT)
参考索引
linux-5.10
系统虚拟化-原理与实现