From c0a3c88a7bf864146e3ab5b2521afbd570ffd4b3 Mon Sep 17 00:00:00 2001
From: Dongli Zhang <[email protected]>
Date: Sun, 26 Apr 2020 17:41:15 -0700
Subject: [PATCH 1/1] kvm for linux v5.5
Signed-off-by: Dongli Zhang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 20 +
arch/x86/include/uapi/asm/kvm_para.h | 35 ++
arch/x86/kernel/kvm.c | 116 ++++++
arch/x86/kernel/kvmclock.c | 80 ++++
arch/x86/kvm/cpuid.c | 7 +
arch/x86/kvm/cpuid.h | 4 +
arch/x86/kvm/hyperv.c | 14 +
arch/x86/kvm/lapic.c | 34 ++
arch/x86/kvm/lapic.h | 64 +++
arch/x86/kvm/mmu.h | 6 +
arch/x86/kvm/mmu/mmu.c | 38 ++
arch/x86/kvm/vmx/vmcs.h | 9 +
arch/x86/kvm/vmx/vmenter.S | 102 +++++
arch/x86/kvm/vmx/vmx.c | 43 ++
arch/x86/kvm/x86.c | 589 +++++++++++++++++++++++++++
drivers/cpuidle/driver.c | 9 +
drivers/vfio/pci/vfio_pci.c | 45 ++
drivers/vfio/pci/vfio_pci_config.c | 4 +
drivers/vfio/pci/vfio_pci_intrs.c | 45 ++
drivers/vfio/pci/vfio_pci_private.h | 43 ++
drivers/vfio/pci/vfio_pci_rdwr.c | 27 ++
drivers/vfio/vfio.c | 93 +++++
drivers/vfio/vfio_iommu_type1.c | 48 +++
include/linux/kvm_host.h | 8 +
include/linux/kvm_para.h | 22 +
include/uapi/linux/kvm.h | 20 +
virt/kvm/eventfd.c | 18 +
virt/kvm/irqchip.c | 12 +
virt/kvm/kvm_main.c | 9 +
29 files changed, 1564 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b79cd6aa4075..8e8635661e64 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -662,6 +662,14 @@ struct kvm_vcpu_arch {
u8 nr;
} interrupt;
+ /*
+ * halt_request is used at:
+ * - arch/x86/kvm/vmx/vmx.c|4552| <<handle_rmode_exception>> if (vcpu->arch.halt_request) {
+ * - arch/x86/kvm/vmx/vmx.c|4553| <<handle_rmode_exception>> vcpu->arch.halt_request = 0;
+ * - arch/x86/kvm/vmx/vmx.c|5242| <<handle_invalid_guest_state>> if (vcpu->arch.halt_request) {
+ * - arch/x86/kvm/vmx/vmx.c|5243| <<handle_invalid_guest_state>> vcpu->arch.halt_request = 0;
+ * - arch/x86/kvm/x86.c|6334| <<emulator_halt>> emul_to_vcpu(ctxt)->arch.halt_request = 1;
+ */
int halt_request; /* real mode on Intel only */
int cpuid_nent;
@@ -943,6 +951,18 @@ struct kvm_arch {
u64 disabled_quirks;
+ /*
+ * irqchip_mode is set at:
+ * - arch/x86/kvm/x86.c|4790| <<kvm_vm_ioctl_enable_cap>> kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
+ * - arch/x86/kvm/x86.c|4911| <<kvm_arch_vm_ioctl>> kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ *
+ * irqchip_mode is used at:
+ * - arch/x86/kvm/ioapic.h|111| <<ioapic_in_kernel>> int mode = kvm->arch.irqchip_mode;
+ * - arch/x86/kvm/irq.h|71| <<pic_in_kernel>> int mode = kvm->arch.irqchip_mode;
+ * - arch/x86/kvm/irq.h|80| <<irqchip_split>> int mode = kvm->arch.irqchip_mode;
+ * - arch/x86/kvm/irq.h|89| <<irqchip_kernel>> int mode = kvm->arch.irqchip_mode;
+ * - arch/x86/kvm/irq.h|98| <<irqchip_in_kernel>> int mode = kvm->arch.irqchip_mode;
+ */
enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 2a8e0b6b9805..4129b082b711 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -32,6 +32,36 @@
#define KVM_FEATURE_POLL_CONTROL 12
#define KVM_FEATURE_PV_SCHED_YIELD 13
+/*
+ * commit a4429e53c9b3082b05e51224c3d58dbdd39306c5
+ * Author: Wanpeng Li <[email protected]>
+ * Date: Tue Feb 13 09:05:40 2018 +0800
+ *
+ * KVM: Introduce paravirtualization hints and KVM_HINTS_DEDICATED
+ *
+ * This patch introduces kvm_para_has_hint() to query for hints about
+ * the configuration of the guests. The first hint KVM_HINTS_DEDICATED,
+ * is set if the guest has dedicated physical CPUs for each vCPU (i.e.
+ * pinning and no over-commitment). This allows optimizing spinlocks
+ * and tells the guest to avoid PV TLB flush.
+ *
+ * Cc: Paolo Bonzini <[email protected]>
+ * Cc: Radim Krčmář <[email protected]>
+ * Cc: Eduardo Habkost <[email protected]>
+ * Signed-off-by: Wanpeng Li <[email protected]>
+ * Signed-off-by: Paolo Bonzini <[email protected]>
+ * Signed-off-by: Radim Krčmář <[email protected]>
+ *
+ * The hint was later renamed to KVM_HINTS_REALTIME.
+ *
+ * KVM_HINTS_REALTIME is used at:
+ * - arch/x86/kernel/kvm.c|534| <<kvm_smp_prepare_cpus>> if (kvm_para_has_hint(KVM_HINTS_REALTIME))
+ * - arch/x86/kernel/kvm.c|627| <<kvm_guest_init>> !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ * - arch/x86/kernel/kvm.c|640| <<kvm_guest_init>> !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ * - arch/x86/kernel/kvm.c|744| <<kvm_setup_pv_tlb_flush>> !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ * - arch/x86/kernel/kvm.c|839| <<kvm_spinlock_init>> if (kvm_para_has_hint(KVM_HINTS_REALTIME))
+ * - drivers/cpuidle/cpuidle-haltpoll.c|105| <<haltpoll_init>> !kvm_para_has_hint(KVM_HINTS_REALTIME))
+ */
#define KVM_HINTS_REALTIME 0
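+
+/*
+ * A typical guest-side consumer (a sketch modelled on kvm_spinlock_init() in
+ * arch/x86/kernel/kvm.c): when the hint is set, every vCPU has a dedicated
+ * pCPU, so the paravirt spinlock slow path is pure overhead and is skipped:
+ *
+ *	static __init void kvm_spinlock_init(void)
+ *	{
+ *		if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+ *			return;
+ *		if (kvm_para_has_hint(KVM_HINTS_REALTIME))
+ *			return;
+ *		... (otherwise switch pv_ops to the PV queued spinlock) ...
+ *	}
+ */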
/* The last 8 bits are used to indicate how to interpret the flags field
@@ -79,6 +109,11 @@ struct kvm_clock_pairing {
#define KVM_MAX_MMU_OP_BATCH 32
#define KVM_ASYNC_PF_ENABLED (1 << 0)
+/*
+ * KVM_ASYNC_PF_SEND_ALWAYS is used at:
+ * - arch/x86/kernel/kvm.c|316| <<kvm_guest_cpu_init>> pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+ * - arch/x86/kvm/x86.c|2564| <<kvm_pv_enable_async_pf>> vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ */
#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 32ef1ee733b7..f74cbac85c01 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -35,6 +35,56 @@
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
+/*
+ * Documentation/virt/kvm/msr.txt
+ * KVM's paravirt features:
+ * - apf
+ * - steal clock
+ * - pv-eoi
+ * - pv-ipi
+ */
+/*
+ * apf (asynchronous page fault)
+ *
+ * 1. A process touches a page that has been swapped out and triggers a page fault; the kvm mmu tries the apf path.
+ * 2. The actual swap-in is handed off to an apf worker, and the guest is notified by injecting a "page not present" exception.
+ * 3. The guest enters its exception handler, blocks the faulting process, and reschedules to run other processes.
+ * 4. When the apf worker has brought the page back in, it notifies the guest that the page is ready; the guest reschedules the blocked process, which can then run normally.
+ *
+ * https://terenceli.github.io/%E6%8A%80%E6%9C%AF/2019/03/24/kvm-async-page-fault
+ * https://www.linux-kvm.org/images/a/ac/2010-forum-Async-page-faults.pdf
+ * https://www.kernelnote.com/entry/kvmguestswap
+ */
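+/*
+ * A sketch of the guest-side apf enablement (modelled on kvm_guest_cpu_init()
+ * in arch/x86/kernel/kvm.c; apf_reason is the per-cpu area the host writes
+ * the fault reason into, and KVM_ASYNC_PF_SEND_ALWAYS is only set on
+ * preemptible kernels):
+ *
+ *	u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+ *
+ *	if (IS_ENABLED(CONFIG_PREEMPTION))
+ *		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+ *	pa |= KVM_ASYNC_PF_ENABLED;
+ *	wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
+ */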
+/*
+ * pv-eoi (reduces the number of VM exits)
+ *
+ * The interrupt controller in the x86 PC architecture -- the 8259A originally, more commonly the APIC today -- handles an interrupt as follows:
+ *
+ * 1. An external device raises an interrupt; if it is not masked, the controller sets the corresponding bit in the IRR register, meaning the interrupt has been received but not yet delivered to the CPU.
+ * 2. The controller delivers the interrupt to the CPU; the CPU acknowledges it back to the controller.
+ * 3. On receiving the CPU's acknowledgment, the controller clears the IRR bit and sets the corresponding ISR bit, meaning the CPU is servicing the interrupt.
+ * 4. Before the interrupt handler finishes, it must set the corresponding bit in the controller's EOI register, signalling that the CPU is done handling the interrupt.
+ * 5. On receiving the EOI, the controller clears the ISR bit, allowing the next interrupt.
+ * 6. Under virtualization this flow costs at least two VM exits: one when the VMM intercepts the device interrupt and makes the guest exit so the interrupt can be injected,
+ *    and one when the guest OS finishes handling the interrupt and writes the controller's EOI register -- an MMIO access that also forces an exit.
+ *    In I/O-heavy workloads, external interrupts therefore cause a large number of VM exits and hurt overall guest performance.
+ *
+ * PV-EOI simply uses paravirtualization to optimize away this VM-exit cost (virtio applies the same idea to network and disk). For EOI the idea is essentially simple:
+ *
+ * 1. Guest and VMM negotiate: they first confirm that both sides support PV-EOI, and if so agree on a 2-byte shared memory area used as the EOI cache.
+ * 2. Before injecting an interrupt into the guest, the VMM sets the lowest bit of the cache, meaning the guest does not need to write the EOI register.
+ * 3. Before writing EOI, the guest checks that bit: if it is set, the guest just clears it; the VMM polls the flag and, once it sees the bit cleared, updates the EOI register of the emulated interrupt controller.
+ *    If the bit is not set, the guest writes the EOI register via MMIO or MSR as usual.
+ *
+ * Note that, to keep the shared memory handling fast and reliable on both sides, KVM's PV-EOI scheme adds the following rules:
+ *
+ * 1. The VMM only changes the lowest bit of the shared memory in the context of the guest vCPU, which spares the guest any locking to synchronize with the VMM.
+ * 2. The guest must use an atomic test_and_clear operation to change that bit, because the VMM may set or clear it at any time.
+ *
+ * https://blog.csdn.net/luo_brian/article/details/8744025?utm_source=tuicool&utm_medium=referral
+ * https://fedoraproject.org/wiki/QA:Testcase_Virtualization_PV_EOI
+ */
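+/*
+ * A sketch of the guest-side fast path this protocol enables (simplified from
+ * the v5.5 kvm_guest_apic_eoi_write() below; kvm_apic_eoi is the negotiated
+ * per-cpu shared area and KVM_PV_EOI_BIT is its lowest bit):
+ *
+ *	static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+ *	{
+ *		if (__test_and_clear_bit(KVM_PV_EOI_BIT,
+ *					 this_cpu_ptr(&kvm_apic_eoi)))
+ *			return;
+ *		apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
+ *	}
+ */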
+
static int kvmapf = 1;
static int __init parse_no_kvmapf(char *arg)
@@ -293,6 +343,22 @@ static void kvm_register_steal_time(void)
static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+/*
+ * PV-EOI simply uses paravirtualization to optimize away the EOI-side VM exit (virtio applies the same idea to network and disk). For EOI the idea is essentially simple:
+ *
+ * 1. Guest and VMM negotiate: they first confirm that both sides support PV-EOI, and if so agree on a 2-byte shared memory area used as the EOI cache.
+ * 2. Before injecting an interrupt into the guest, the VMM sets the lowest bit of the cache, meaning the guest does not need to write the EOI register.
+ * 3. Before writing EOI, the guest checks that bit: if it is set, the guest just clears it; the VMM polls the flag and, once it sees the bit cleared, updates the EOI register of the emulated interrupt controller.
+ *    If the bit is not set, the guest writes the EOI register via MMIO or MSR as usual.
+ *
+ * Note that, to keep the shared memory handling fast and reliable on both sides, KVM's PV-EOI scheme adds the following rules:
+ *
+ * 1. The VMM only changes the lowest bit of the shared memory in the context of the guest vCPU, which spares the guest any locking to synchronize with the VMM.
+ * 2. The guest must use an atomic test_and_clear operation to change that bit, because the VMM may set or clear it at any time.
+ *
+ * https://blog.csdn.net/luo_brian/article/details/8744025?utm_source=tuicool&utm_medium=referral
+ * https://fedoraproject.org/wiki/QA:Testcase_Virtualization_PV_EOI
+ */
static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
/**
@@ -581,6 +647,31 @@ static void __init kvm_apf_trap_init(void)
static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
+/*
+ * commit 858a43aae23672d46fe802a41f4748f322965182
+ * Author: Wanpeng Li <[email protected]>
+ * Date: Tue Dec 12 17:33:02 2017 -0800
+ *
+ * KVM: X86: use paravirtualized TLB Shootdown
+ *
+ * Remote TLB flush does a busy wait which is fine in bare-metal
+ * scenario. But with-in the guest, the vcpus might have been pre-empted or
+ * blocked. In this scenario, the initator vcpu would end up busy-waiting
+ * for a long amount of time; it also consumes CPU unnecessarily to wake
+ * up the target of the shootdown.
+ *
+ * This patch set adds support for KVM's new paravirtualized TLB flush;
+ * remote TLB flush does not wait for vcpus that are sleeping, instead
+ * KVM will flush the TLB as soon as the vCPU starts running again.
+ *
+ * The improvement is clearly visible when the host is overcommitted; in this
+ * case, the PV TLB flush (in addition to avoiding the wait on the main CPU)
+ * prevents preempted vCPUs from stealing precious execution time from the
+ * running ones.
+ *
+ * Testing on a Xeon Gold 6142 2.6GHz 2 sockets, 32 cores, 64 threads,
+ * so 64 pCPUs, and each VM is 64 vCPUs.
+ */
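+/*
+ * The core of the guest-side implementation (a sketch of the v5.5 body of
+ * kvm_flush_tlb_others(), which the hunk below does not show): vCPUs the host
+ * marks as preempted are skipped, and KVM_VCPU_FLUSH_TLB is set atomically in
+ * their shared steal_time area so the host flushes for them on the next
+ * VM-entry:
+ *
+ *	cpumask_copy(flushmask, cpumask);
+ *	for_each_cpu(cpu, flushmask) {
+ *		src = &per_cpu(steal_time, cpu);
+ *		state = READ_ONCE(src->preempted);
+ *		if ((state & KVM_VCPU_PREEMPTED) &&
+ *		    cmpxchg(&src->preempted, state,
+ *			    state | KVM_VCPU_FLUSH_TLB) == state)
+ *			__cpumask_clear_cpu(cpu, flushmask);
+ *	}
+ *	native_flush_tlb_others(flushmask, info);
+ */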
static void kvm_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
@@ -736,6 +827,31 @@ static __init int activate_jump_labels(void)
}
arch_initcall(activate_jump_labels);
+/*
+ * commit 858a43aae23672d46fe802a41f4748f322965182
+ * Author: Wanpeng Li <[email protected]>
+ * Date: Tue Dec 12 17:33:02 2017 -0800
+ *
+ * KVM: X86: use paravirtualized TLB Shootdown
+ *
+ * Remote TLB flush does a busy wait which is fine in bare-metal
+ * scenario. But with-in the guest, the vcpus might have been pre-empted or
+ * blocked. In this scenario, the initator vcpu would end up busy-waiting
+ * for a long amount of time; it also consumes CPU unnecessarily to wake
+ * up the target of the shootdown.
+ *
+ * This patch set adds support for KVM's new paravirtualized TLB flush;
+ * remote TLB flush does not wait for vcpus that are sleeping, instead
+ * KVM will flush the TLB as soon as the vCPU starts running again.
+ *
+ * The improvement is clearly visible when the host is overcommitted; in this
+ * case, the PV TLB flush (in addition to avoiding the wait on the main CPU)
+ * prevents preempted vCPUs from stealing precious execution time from the
+ * running ones.
+ *
+ * Testing on a Xeon Gold 6142 2.6GHz 2 sockets, 32 cores, 64 threads,
+ * so 64 pCPUs, and each VM is 64 vCPUs.
+ */
static __init int kvm_setup_pv_tlb_flush(void)
{
int cpu;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 904494b924c1..9359910bbb52 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,7 +23,87 @@
#include <asm/reboot.h>
#include <asm/kvmclock.h>
+/*
+ * FROM SUSE!!!
+ * When using kvm-clock, it is not recommended to use NTP in the VM Guest, as
+ * well. Using NTP on the VM Host Server, however, is still recommended.
+ */
+
+/*
+ * Clocksource is a device that can give a timestamp whenever you need it. In
+ * other words, Clocksource is any ticking counter that allows you to get its
+ * value.
+ *
+ * Clockevent device is an alarm clock—you ask the device to signal a time in
+ * the future (e.g., "wake me up in 1ms") and when the alarm is triggered, you
+ * get the signal.
+ *
+ * sched_clock() function is similar to clocksource, but this particular one
+ * should be "cheap" to read (meaning that one can get its value fast), as
+ * sched_clock() is used for task-scheduling purposes and scheduling happens
+ * often. We're ready to sacrifice accuracy and other characteristics for
+ * speed.
+ *
+ * > CLOCK_REALTIME clock gives the time passed since January 1, 1970. This
+ * clock is affected by NTP adjustments and can jump forward and backward when
+ * a system administrator adjusts system time.
+ *
+ * > CLOCK_MONOTONIC clock gives the time since a fixed starting point-usually
+ * since you booted the system. This clock is affected by NTP, but it can't
+ * jump backward.
+ *
+ * > CLOCK_MONOTONIC_RAW clock gives the same time as CLOCK_MONOTONIC, but this
+ * clock is not affected by NTP adjustments.
+ *
+ * > CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE are faster but
+ * less-accurate variants of CLOCK_REALTIME and CLOCK_MONOTONIC.
+ *
+ * Hardware extensions for virtualizing TSC
+ *
+ * Since the early days of hardware-assisted virtualization, Intel was
+ * supplying an option to do TSC offsetting for virtual guests in hardware,
+ * which would mean that a guest's rdtsc reading will return a host's TSC value
+ * + offset. Unfortunately, this wasn't enough to support migration between
+ * different hosts because TSC frequency may differ, so pvclock and TSC page
+ * protocol were introduced. In late 2015, Intel introduced the TSC scaling
+ * feature (which was already present in AMD processors for several years) and,
+ * in theory, this is a game changer making pvclock and TSC page protocols
+ * redundant. However, an immediate switch to using plain TSC as a clocksource
+ * for virtualized guests seems impractical; one must be sure that all
+ * potential migration recipient hosts support the feature, but it is not yet
+ * widely available. Extensive testing also must be performed to make sure
+ * there are no drawbacks to switching from paravirtualized protocols.
+ */
+
+/*
+ * To get the current TSC reading, guests must do the following math:
+ *
+ * PerCPUTime = ((RDTSC() - tsc_timestamp) >> tsc_shift) * tsc_to_system_mul + system_time
+ *
+ *
+ *
+ * kvmclock or KVM pvclock lets guests read the host's wall clock time. It's
+ * really very simple: the guest sets aside a page of its RAM and asks the host
+ * to write time into that page (using an MSR). The host writes a structure
+ * containing the current time to this page - in theory the host updates this
+ * page constantly, but in reality that would be wasteful and the structure is
+ * only updated just before reentering the guest after some VM event.
+ * On the host, the clock is updated by kvm_guest_time_update(), which is only called from vcpu_enter_guest().
+ */
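+/*
+ * The same math in code form (a sketch; field names follow
+ * struct pvclock_vcpu_time_info, tsc_to_system_mul is a 32.32 fixed-point
+ * multiplier, and a negative tsc_shift means shift right -- the real kernel
+ * code uses a 128-bit multiply to avoid overflow):
+ *
+ *	u64 pvclock_read(const struct pvclock_vcpu_time_info *ti)
+ *	{
+ *		u64 delta = rdtsc() - ti->tsc_timestamp;
+ *
+ *		if (ti->tsc_shift >= 0)
+ *			delta <<= ti->tsc_shift;
+ *		else
+ *			delta >>= -ti->tsc_shift;
+ *		return ((delta * ti->tsc_to_system_mul) >> 32) + ti->system_time;
+ *	}
+ */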
+
static int kvmclock __initdata = 1;
+/*
+ * commit 3dc4f7cfb7441e5e0fed3a02fc81cdaabd28300a
+ * Author: Marcelo Tosatti <[email protected]>
+ * Date: Tue Nov 27 23:28:56 2012 -0200
+ *
+ * x86: kvm guest: pvclock vsyscall support
+ *
+ * Hook into generic pvclock vsyscall code, with the aim to
+ * allow userspace to have visibility into pvclock data.
+ *
+ * Signed-off-by: Marcelo Tosatti <[email protected]>
+ */
static int kvmclock_vsyscall __initdata = 1;
static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index cf55629ff0ff..af7e5c07999f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -994,6 +994,13 @@ static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function)
return max && function <= max->eax;
}
+/*
+ * called by:
+ * - arch/x86/kvm/cpuid.c|1063| <<kvm_emulate_cpuid>> kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
+ * - arch/x86/kvm/svm.c|2122| <<svm_vcpu_reset>> kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
+ * - arch/x86/kvm/trace.h|153| <<__field>> TRACE_EVENT(kvm_cpuid,
+ * - arch/x86/kvm/x86.c|6347| <<emulator_get_cpuid>> return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
+ */
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool check_limit)
{
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index d78a61408243..248931e1891b 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -132,6 +132,10 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
return x86_family(best->eax);
}
+/*
+ * called by:
+ * - arch/x86/kvm/svm.c|4213| <<svm_get_msr>> model = guest_cpuid_model(vcpu);
+ */
static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 23ff65504d7e..601d5170de86 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -199,6 +199,20 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
}
+/*
+ * [0] kvm_hv_set_msr_common
+ * [0] kvm_set_msr_common
+ * [0] vmx_set_msr
+ * [0] __kvm_set_msr
+ * [0] msr_io
+ * [0] kvm_arch_vcpu_ioctl
+ * [0] kvm_vcpu_ioctl
+ * [0] do_vfs_ioctl
+ * [0] ksys_ioctl
+ * [0] __x64_sys_ioctl
+ * [0] do_syscall_64
+ * [0] entry_SYSCALL_64_after_hwframe
+ */
static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
u32 msr, u64 data, bool host)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cf9177b4a07f..8c38c9ff1019 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -96,6 +96,15 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
+/*
+ * apic_hw_disabled is used at:
+ * - arch/x86/kvm/lapic.c|2098| <<kvm_free_lapic>> static_key_slow_dec_deferred(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.c|2175| <<kvm_lapic_set_base>> static_key_slow_dec_deferred(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.c|2177| <<kvm_lapic_set_base>> static_key_slow_inc(&apic_hw_disabled.key);
+ * - arch/x86/kvm/lapic.c|2795| <<kvm_lapic_init>> jump_label_rate_limit(&apic_hw_disabled, HZ);
+ * - arch/x86/kvm/lapic.c|2801| <<kvm_lapic_exit>> static_key_deferred_flush(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.h|175| <<kvm_apic_hw_enabled>> if (static_key_false(&apic_hw_disabled.key))
+ */
struct static_key_deferred apic_hw_disabled __read_mostly;
struct static_key_deferred apic_sw_disabled __read_mostly;
@@ -425,6 +434,14 @@ static inline int apic_search_irr(struct kvm_lapic *apic)
return find_highest_vector(apic->regs + APIC_IRR);
}
+/*
+ * called by:
+ * - arch/x86/kvm/lapic.c|455| <<apic_clear_irr>> apic_find_highest_irr(apic));
+ * - arch/x86/kvm/lapic.c|543| <<kvm_lapic_find_highest_irr>> return apic_find_highest_irr(vcpu->arch.apic);
+ * - arch/x86/kvm/lapic.c|665| <<apic_has_interrupt_for_ppr>> highest_irr = apic_find_highest_irr(apic);
+ * - arch/x86/kvm/lapic.c|2496| <<kvm_apic_set_state>> apic_find_highest_irr(apic));
+ * - arch/x86/kvm/lapic.c|2614| <<kvm_lapic_sync_to_vapic>> max_irr = apic_find_highest_irr(apic);
+ */
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
int result;
@@ -2144,6 +2161,13 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
return (tpr & 0xf0) >> 4;
}
+/*
+ * called by:
+ * - arch/x86/kvm/lapic.c|2208| <<kvm_lapic_reset>> kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+ * - arch/x86/kvm/lapic.c|2244| <<kvm_lapic_reset>> kvm_lapic_set_base(vcpu,
+ * - arch/x86/kvm/lapic.c|2485| <<kvm_apic_set_state>> kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+ * - arch/x86/kvm/x86.c|349| <<kvm_set_apic_base>> kvm_lapic_set_base(vcpu, msr_info->data);
+ */
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
u64 old_value = vcpu->arch.apic_base;
@@ -2268,6 +2292,12 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
return 0;
}
+/*
+ * called by:
+ * - arch/x86/kvm/lapic.c|1576| <<kvm_apic_inject_pending_timer_irqs>> kvm_apic_local_deliver(apic, APIC_LVTT);
+ * - arch/x86/kvm/lapic.c|2291| <<kvm_apic_nmi_wd_deliver>> kvm_apic_local_deliver(apic, APIC_LVT0);
+ * - arch/x86/kvm/pmu.c|382| <<kvm_pmu_deliver_pmi>> kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+ */
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
u32 reg = kvm_lapic_get_reg(apic, lvt_type);
@@ -2599,6 +2629,10 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
pv_eoi_set_pending(apic->vcpu);
}
+/*
+ * called by:
+ * - arch/x86/kvm/x86.c|8145| <<vcpu_enter_guest>> kvm_lapic_sync_to_vapic(vcpu);
+ */
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
{
u32 data, tpr;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 39925afdfcdc..2c6ab1d4ac5f 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -159,19 +159,62 @@ static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 va
*((u32 *) (apic->regs + reg_off)) = val;
}
+/*
+ * kvm_no_apic_vcpu is used at:
+ * - arch/x86/kvm/lapic.h|166| <<lapic_in_kernel>> if (static_key_false(&kvm_no_apic_vcpu))
+ * - arch/x86/kvm/x86.c|9438| <<kvm_arch_vcpu_init>> static_key_slow_inc(&kvm_no_apic_vcpu);
+ * - arch/x86/kvm/x86.c|9497| <<kvm_arch_vcpu_uninit>> static_key_slow_dec(&kvm_no_apic_vcpu);
+ */
extern struct static_key kvm_no_apic_vcpu;
static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
{
+ /*
+ * 9419 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+ * ... ...
+ * 9443 if (irqchip_in_kernel(vcpu->kvm)) {
+ * 9444 vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm);
+ * 9445 r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ * 9446 if (r < 0)
+ * 9447 goto fail_mmu_destroy;
+ * 9448 } else
+ * 9449 static_key_slow_inc(&kvm_no_apic_vcpu);
+ *
+ * kvm_no_apic_vcpu is used at:
+ * - arch/x86/kvm/lapic.h|166| <<lapic_in_kernel>> if (static_key_false(&kvm_no_apic_vcpu))
+ * - arch/x86/kvm/x86.c|9438| <<kvm_arch_vcpu_init>> static_key_slow_inc(&kvm_no_apic_vcpu);
+ * - arch/x86/kvm/x86.c|9497| <<kvm_arch_vcpu_uninit>> static_key_slow_dec(&kvm_no_apic_vcpu);
+ *
+ * If kvm_no_apic_vcpu is false, then kvm_arch_vcpu_init()->kvm_create_lapic() was called
+ * and vcpu->arch.apic has been initialized.
+ */
if (static_key_false(&kvm_no_apic_vcpu))
return vcpu->arch.apic;
return true;
}
+/*
+ * apic_hw_disabled is used at:
+ * - arch/x86/kvm/lapic.c|2098| <<kvm_free_lapic>> static_key_slow_dec_deferred(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.c|2175| <<kvm_lapic_set_base>> static_key_slow_dec_deferred(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.c|2177| <<kvm_lapic_set_base>> static_key_slow_inc(&apic_hw_disabled.key);
+ * - arch/x86/kvm/lapic.c|2795| <<kvm_lapic_init>> jump_label_rate_limit(&apic_hw_disabled, HZ);
+ * - arch/x86/kvm/lapic.c|2801| <<kvm_lapic_exit>> static_key_deferred_flush(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.h|175| <<kvm_apic_hw_enabled>> if (static_key_false(&apic_hw_disabled.key))
+ */
extern struct static_key_deferred apic_hw_disabled;
static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
{
+ /*
+ * apic_hw_disabled is used at:
+ * - arch/x86/kvm/lapic.c|2098| <<kvm_free_lapic>> static_key_slow_dec_deferred(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.c|2175| <<kvm_lapic_set_base>> static_key_slow_dec_deferred(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.c|2177| <<kvm_lapic_set_base>> static_key_slow_inc(&apic_hw_disabled.key);
+ * - arch/x86/kvm/lapic.c|2795| <<kvm_lapic_init>> jump_label_rate_limit(&apic_hw_disabled, HZ);
+ * - arch/x86/kvm/lapic.c|2801| <<kvm_lapic_exit>> static_key_deferred_flush(&apic_hw_disabled);
+ * - arch/x86/kvm/lapic.h|175| <<kvm_apic_hw_enabled>> if (static_key_false(&apic_hw_disabled.key))
+ */
if (static_key_false(&apic_hw_disabled.key))
return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
return MSR_IA32_APICBASE_ENABLE;
@@ -186,8 +229,29 @@ static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
return true;
}
+/*
+ * called by:
+ * - arch/x86/kvm/irq_comm.c|67| <<kvm_irq_delivery_to_apic>> if (!kvm_apic_present(vcpu))
+ * - arch/x86/kvm/irq_comm.c|334| <<kvm_intr_is_single_vcpu>> if (!kvm_apic_present(vcpu))
+ * - arch/x86/kvm/lapic.c|189| <<recalculate_apic_map>> if (kvm_apic_present(vcpu))
+ * - arch/x86/kvm/lapic.c|209| <<recalculate_apic_map>> if (!kvm_apic_present(vcpu))
+ * - arch/x86/kvm/lapic.c|1168| <<kvm_bitmap_or_dest_vcpus>> if (!kvm_apic_present(vcpu))
+ * - arch/x86/kvm/lapic.h|217| <<kvm_lapic_enabled>> return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+ * - arch/x86/kvm/x86.c|7912| <<vcpu_scan_ioapic>> if (!kvm_apic_present(vcpu))
+ */
static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
{
+ /*
+ * 9419 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+ * ... ...
+ * 9443 if (irqchip_in_kernel(vcpu->kvm)) {
+ * 9444 vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm);
+ * 9445 r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ * 9446 if (r < 0)
+ * 9447 goto fail_mmu_destroy;
+ * 9448 } else
+ * 9449 static_key_slow_inc(&kvm_no_apic_vcpu);
+ */
return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d55674f44a18..51db11a6e926 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -73,6 +73,12 @@ static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
return 0;
}
+/*
+ * called by:
+ * - arch/x86/kvm/vmx/nested.c|5234| <<nested_vmx_eptp_switching>> kvm_mmu_reload(vcpu);
+ * - arch/x86/kvm/x86.c|8142| <<vcpu_enter_guest>> r = kvm_mmu_reload(vcpu);
+ * - arch/x86/kvm/x86.c|10009| <<kvm_arch_async_page_ready>> r = kvm_mmu_reload(vcpu);
+ */
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6f92b40d798c..45d86f27bb08 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4183,6 +4183,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
+/*
+ * 1. Find the memslot corresponding to the gfn.
+ * 2. Compute the gfn's hva as the slot's starting hva (userspace_addr) +
+ *    (gfn - the slot's starting gfn (base_gfn)) * page size (PAGE_SIZE); see the sketch below.
+ * 3. Get a physical page for that hva, via either hva_to_pfn_fast() or hva_to_pfn_slow().
+ *    hva_to_pfn_fast() actually calls __get_user_pages_fast(), which tries to pin the page,
+ *    i.e. to make sure the physical page backing the address is resident in memory. If that
+ *    fails, fall back to hva_to_pfn_slow(), which takes mm->mmap_sem and then calls __get_user_pages() to pin.
+ * 4. On success, call page_to_pfn() on the returned struct page to get the pfn.
+ */
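+/*
+ * Step 2 above in code form (this is essentially __gfn_to_hva_memslot()):
+ *
+ *	static unsigned long gfn_to_hva(struct kvm_memory_slot *slot, gfn_t gfn)
+ *	{
+ *		return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+ *	}
+ */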
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
{
@@ -5257,6 +5266,11 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+/*
+ * called by:
+ * - arch/x86/kvm/mmu.h|81| <<kvm_mmu_reload>> return kvm_mmu_load(vcpu);
+ * - arch/x86/kvm/svm.c|3442| <<nested_svm_vmexit>> kvm_mmu_load(&svm->vcpu);
+ */
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
@@ -5516,6 +5530,30 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
return 0;
}
+/*
+ * KVM MMIO handling:
+ *
+ * Initially MMIO memory has no kvm_memory_slot and no PTEs, so an access triggers an EPT
+ * violation. handle_ept_violation() calls kvm_mmu_page_fault(). Since handle_ept_violation()
+ * does not set PFERR_RSVD_MASK in error_code, kvm_mmu_page_fault() calls tdp_page_fault().
+ *
+ * tdp_page_fault()-->try_async_pf()-->__gfn_to_pfn_memslot()-->__gfn_to_hva_many().
+ * __gfn_to_hva_many() returns KVM_HVA_ERR_BAD=PAGE_OFFSET, so __gfn_to_pfn_memslot() returns
+ * KVM_PFN_NOSLOT.
+ *
+ * Then tdp_page_fault()-->__direct_map() marks the corresponding PTE as an MMIO PTE via
+ * mmu_set_spte()-->set_mmio_spte(), i.e.:
+ *
+ * 1. SPTE_SPECIAL_MASK (1 shifted left by 62) is set
+ * 2. the low bits end in binary 110
+ * 3. the generation number is also encoded in it
+ *
+ * Finally it returns RET_PF_EMULATE.
+ *
+ * On the next fault, the MMIO access triggers handle_ept_misconfig()-->kvm_mmu_page_fault().
+ * This time handle_ept_misconfig() sets error_code=PFERR_RSVD_MASK, so kvm_mmu_page_fault()
+ * calls handle_mmio_page_fault() and the access is emulated; see the sketch below.
+ */
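+/*
+ * The dispatch described above, roughly (simplified from the v5.5 body of
+ * kvm_mmu_page_fault(): PFERR_RSVD_MASK routes MMIO faults to the emulation
+ * path, everything else goes to the per-mmu page fault handler):
+ *
+ *	r = RET_PF_INVALID;
+ *	if (unlikely(error_code & PFERR_RSVD_MASK))
+ *		r = handle_mmio_page_fault(vcpu, cr2, direct);
+ *	if (r == RET_PF_INVALID)
+ *		r = vcpu->arch.mmu->page_fault(vcpu, cr2,
+ *					       lower_32_bits(error_code), false);
+ *	if (r == RET_PF_EMULATE)
+ *		goto emulate;
+ */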
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
void *insn, int insn_len)
{
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 481ad879197b..667b168190ca 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -59,6 +59,15 @@ struct loaded_vmcs {
struct vmcs *vmcs;
struct vmcs *shadow_vmcs;
int cpu;
+ /*
+ * launched is used at:
+ * - arch/x86/kvm/vmx/nested.c|3008| <<nested_vmx_check_vmentry_hw>> [launched]"i"(offsetof(struct loaded_vmcs, launched)),
+ * - arch/x86/kvm/vmx/vmx.c|660| <<loaded_vmcs_init>> if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ * - arch/x86/kvm/vmx/vmx.c|663| <<loaded_vmcs_init>> loaded_vmcs->launched = 0;
+ * - arch/x86/kvm/vmx/vmx.c|6596| <<vmx_vcpu_run>> vmx->loaded_vmcs->launched);
+ * - arch/x86/kvm/vmx/vmx.c|6678| <<vmx_vcpu_run>> vmx->loaded_vmcs->launched = 1;
+ * - also used in arch/x86/kvm/vmx/vmenter.S, where it selects between vmresume and vmlaunch
+ */
bool launched;
bool nmi_known_unmasked;
bool hv_timer_soft_disabled;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 81ada2ce99e7..db945eb46778 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -7,6 +7,28 @@
#define WORD_SIZE (BITS_PER_LONG / 8)
+/*
+ * #define __VCPU_REGS_RAX 0
+ * #define __VCPU_REGS_RCX 1
+ * #define __VCPU_REGS_RDX 2
+ * #define __VCPU_REGS_RBX 3
+ * #define __VCPU_REGS_RSP 4
+ * #define __VCPU_REGS_RBP 5
+ * #define __VCPU_REGS_RSI 6
+ * #define __VCPU_REGS_RDI 7
+ *
+ * #ifdef CONFIG_X86_64
+ * #define __VCPU_REGS_R8 8
+ * #define __VCPU_REGS_R9 9
+ * #define __VCPU_REGS_R10 10
+ * #define __VCPU_REGS_R11 11
+ * #define __VCPU_REGS_R12 12
+ * #define __VCPU_REGS_R13 13
+ * #define __VCPU_REGS_R14 14
+ * #define __VCPU_REGS_R15 15
+ * #endif
+ */
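+
+/*
+ * With WORD_SIZE == 8 on x86_64, the offsets below work out to
+ * VCPU_RAX = 0, VCPU_RCX = 8, VCPU_RDX = 16, ..., VCPU_R15 = 15 * 8 = 120:
+ * byte offsets of each register's slot in the vcpu->arch.regs array that
+ * __vmx_vcpu_run() saves to and restores from.
+ */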
+
#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE
#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
@@ -44,6 +66,15 @@
* to vmx_vmexit.
*/
SYM_FUNC_START(vmx_vmenter)
+ /*
+ * launched is used at:
+ * - arch/x86/kvm/vmx/nested.c|3008| <<nested_vmx_check_vmentry_hw>> [launched]"i"(offsetof(struct loaded_vmcs, launched)),
+ * - arch/x86/kvm/vmx/vmx.c|660| <<loaded_vmcs_init>> if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ * - arch/x86/kvm/vmx/vmx.c|663| <<loaded_vmcs_init>> loaded_vmcs->launched = 0;
+ * - arch/x86/kvm/vmx/vmx.c|6596| <<vmx_vcpu_run>> vmx->loaded_vmcs->launched);
+ * - arch/x86/kvm/vmx/vmx.c|6678| <<vmx_vcpu_run>> vmx->loaded_vmcs->launched = 1;
+ * - also used in arch/x86/kvm/vmx/vmenter.S, where it selects between vmresume and vmlaunch
+ */
/* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
je 2f
@@ -62,6 +93,18 @@ SYM_FUNC_START(vmx_vmenter)
5: jmp 3b
.popsection
+ /*
+ * https://lwn.net/Articles/531148/
+ *
+ * The _ASM_EXTABLE(addr1, addr2) macro allows the page fault exception
+ * handler to determine whether an exception was caused by a kernel
+ * instruction at address addr1 while trying to read or write a byte
+ * into a process address space. If so, the kernel jumps to addr2 that
+ * contains the fixup code, otherwise a kernel oops occurs. The
+ * delimiters of the __ex_table special section (see the previous
+ * linker script example) set the range of critical kernel instructions
+ * that transfer bytes from or to user space.
+ */
_ASM_EXTABLE(1b, 5b)
_ASM_EXTABLE(2b, 5b)
@@ -77,6 +120,10 @@ SYM_FUNC_END(vmx_vmenter)
* here after hardware loads the host's state, i.e. this is the destination
* referred to by VMCS.HOST_RIP.
*/
+/*
+ * vmx_vmexit is used at:
+ * - arch/x86/kvm/vmx/vmx.c|3923| <<vmx_set_constant_host_state>> vmcs_writel(HOST_RIP, (unsigned long )vmx_vmexit);
+ */
SYM_FUNC_START(vmx_vmexit)
#ifdef CONFIG_RETPOLINE
ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
@@ -101,6 +148,19 @@ SYM_FUNC_END(vmx_vmexit)
* Returns:
* 0 on VM-Exit, 1 on VM-Fail
*/
+/*
+ * bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
+ *
+ * vmx_vcpu_run() invokes it as:
+ * 6595 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ * 6596 vmx->loaded_vmcs->launched);
+ *
+ * calling convention: RDI, RSI, RDX, RCX, R8, R9, [XYZ]MM0–7
+ *
+ * rdi: struct vcpu_vmx *vmx
+ * rsi: unsigned long *regs
+ * rdx: bool launched
+ */
SYM_FUNC_START(__vmx_vcpu_run)
push %_ASM_BP
mov %_ASM_SP, %_ASM_BP
@@ -119,18 +179,46 @@ SYM_FUNC_START(__vmx_vcpu_run)
* Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
* @regs is needed after VM-Exit to save the guest's register values.
*/
+ /* On x86_64 this is _ASM_SI: the second argument, (unsigned long *)&vcpu->arch.regs */
push %_ASM_ARG2
/* Copy @launched to BL, _ASM_ARG3 is volatile. */
+ /* On x86_64 this is %dl: the third argument, vmx->loaded_vmcs->launched */
mov %_ASM_ARG3B, %bl
/* Adjust RSP to account for the CALL to vmx_vmenter(). */
+ /*
+ * On x86_64, %_ASM_ARG2 is _ASM_SI.
+ * This effectively loads the host rsp value (which does not include (unsigned long *)&vcpu->arch.regs)
+ * into _ASM_SI as the "unsigned long host_rsp" argument of vmx_update_host_rsp().
+ */
lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2
+ /*
+ * 6486 void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+ * 6487 {
+ * 6488 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
+ * 6489 vmx->loaded_vmcs->host_state.rsp = host_rsp;
+ * 6490 vmcs_writel(HOST_RSP, host_rsp);
+ * 6491 }
+ * 6492 }
+ */
call vmx_update_host_rsp
/* Load @regs to RAX. */
+ /*
+ * 和上面的"lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2"对应
+ */
mov (%_ASM_SP), %_ASM_AX
+ /*
+ * launched is used at:
+ * - arch/x86/kvm/vmx/nested.c|3008| <<nested_vmx_check_vmentry_hw>> [launched]"i"(offsetof(struct loaded_vmcs, launched)),
+ * - arch/x86/kvm/vmx/vmx.c|660| <<loaded_vmcs_init>> if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ * - arch/x86/kvm/vmx/vmx.c|663| <<loaded_vmcs_init>> loaded_vmcs->launched = 0;
+ * - arch/x86/kvm/vmx/vmx.c|6596| <<vmx_vcpu_run>> vmx->loaded_vmcs->launched);
+ * - arch/x86/kvm/vmx/vmx.c|6678| <<vmx_vcpu_run>> vmx->loaded_vmcs->launched = 1;
+ * - also used in arch/x86/kvm/vmx/vmenter.S, where it selects between vmresume and vmlaunch
+ */
/* Check if vmlaunch or vmresume is needed */
cmpb $0, %bl
@@ -154,6 +242,20 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
+ /*
+ * vmx_vmenter - VM-Enter the current loaded VMCS
+ *
+ * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME
+ *
+ * Returns:
+ * %RFLAGS.CF is set on VM-Fail Invalid
+ * %RFLAGS.ZF is set on VM-Fail Valid
+ * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
+ *
+ * Note that VMRESUME/VMLAUNCH fall-through and return directly if
+ * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
+ * to vmx_vmexit.
+ */
/* Enter guest mode */
call vmx_vmenter
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e3394c839dea..75ccdef59989 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1085,6 +1085,10 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
}
}
+/*
+ * called by:
+ * - arch/x86/kvm/vmx/vmx.c|6547| <<vmx_vcpu_run>> pt_guest_enter(vmx);
+ */
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
if (pt_mode == PT_MODE_SYSTEM)
@@ -2622,6 +2626,12 @@ static void free_kvm_area(void)
}
}
+/*
+ * called by:
+ * - arch/x86/kvm/vmx/vmx.c|7747| <<hardware_setup>> r = alloc_kvm_area();
+ *
+ * Allocates the vmxon_region.
+ */
static __init int alloc_kvm_area(void)
{
int cpu;
@@ -5062,6 +5072,9 @@ static int handle_apic_write(struct kvm_vcpu *vcpu)
return 1;
}
+/*
+ * kvm_vmx_exit_handlers[EXIT_REASON_TASK_SWITCH] = handle_task_switch()
+ */
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5119,6 +5132,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
reason, has_error_code, error_code);
}
+/*
+ * When the guest first touches a page, there is no GVA->GPA mapping yet, so the guest OS takes
+ * a page fault, builds the corresponding pte, fixes up each level of its page tables, and
+ * finally accesses the GPA. Since the EPT has no mapping for that GPA (GPA->HPA) yet, this
+ * triggers an EPT violation and a VMEXIT into KVM. In vmx_handle_exit, KVM runs
+ * kvm_vmx_exit_handlers[exit_reason]; the exit_reason is EXIT_REASON_EPT_VIOLATION, so
+ * handle_ept_violation is called (see the sketch below).
+ */
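+/*
+ * The dispatch in vmx_handle_exit(), roughly (a sketch of the v5.5 code):
+ *
+ *	u32 exit_reason = vmx->exit_reason;
+ *
+ *	if (exit_reason >= kvm_vmx_max_exit_handlers ||
+ *	    !kvm_vmx_exit_handlers[exit_reason])
+ *		goto unexpected_vmexit;
+ *	return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ */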
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
@@ -5191,6 +5211,10 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
+/*
+ * called by:
+ * - arch/x86/kvm/vmx/vmx.c|5845| <<vmx_handle_exit>> return handle_invalid_guest_state(vcpu);
+ */
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6469,6 +6493,9 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
+/*
+ * struct kvm_x86_ops vmx_x86_ops.run = vmx_vcpu_run()
+ */
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6553,6 +6580,18 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->arch.cr2 != read_cr2())
write_cr2(vcpu->arch.cr2);
+ /*
+ * arch/x86/kvm/vmx/vmenter.S:
+ * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
+ * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp)
+ * @regs: unsigned long * (to guest registers)
+ * @launched: %true if the VMCS has been launched
+ *
+ * Returns:
+ * 0 on VM-Exit, 1 on VM-Fail
+ *
+ * This is the only call site of __vmx_vcpu_run().
+ */
vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
vmx->loaded_vmcs->launched);
@@ -6672,6 +6711,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vmx);
}
+/*
+ * Called on x86 from:
+ * - arch/x86/kvm/x86.c|9132| <<kvm_arch_vcpu_create>> vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+ */
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf917139de6b..a28ee5c341a8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -74,8 +74,21 @@
#define CREATE_TRACE_POINTS
#include "trace.h"
+/*
+ * MAX_IO_MSRS is used at:
+ * - arch/x86/kvm/x86.c|3188| <<msr_io>> if (msrs.nmsrs >= MAX_IO_MSRS)
+ *
+ * The maximum number of MSRs that a single KVM_SET_MSRS/KVM_GET_MSRS call can pass in from userspace; see the sketch below.
+ */
#define MAX_IO_MSRS 256
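+/*
+ * The bound check in msr_io(), sketched (overlong requests fail with -E2BIG
+ * before any MSR is touched):
+ *
+ *	r = -E2BIG;
+ *	if (msrs.nmsrs >= MAX_IO_MSRS)
+ *		goto out;
+ */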
#define KVM_MAX_MCE_BANKS 32
+/*
+ * kvm_mce_cap_supported is used at:
+ * - arch/x86/kvm/vmx/vmx.c|7748| <<hardware_setup>> kvm_mce_cap_supported |= MCG_LMCE_P;
+ * - arch/x86/kvm/x86.c|3410| <<kvm_arch_dev_ioctl>> if (copy_to_user(argp, &kvm_mce_cap_supported,
+ * - arch/x86/kvm/x86.c|3411| <<kvm_arch_dev_ioctl>> sizeof(kvm_mce_cap_supported)))
+ * - arch/x86/kvm/x86.c|3660| <<kvm_vcpu_ioctl_x86_setup_mce>> if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
+ */
u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
@@ -87,6 +100,12 @@ EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
* - enable LME and LMA per default on 64 bit KVM
*/
#ifdef CONFIG_X86_64
+/*
+ * efer_reserved_bits is used at:
+ * - arch/x86/kvm/x86.c|1378| <<kvm_valid_efer>> if (efer & efer_reserved_bits)