-
Notifications
You must be signed in to change notification settings - Fork 4
/
intel-iommu-irq-vfio-and-pci-comment-for-4.17.14.patch
3341 lines (3082 loc) · 111 KB
/
intel-iommu-irq-vfio-and-pci-comment-for-4.17.14.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
From 11c52ae75a96475413b190d1de7720342c426dc9 Mon Sep 17 00:00:00 2001
From: Dongli Zhang <[email protected]>
Date: Mon, 8 Apr 2019 10:38:21 +0800
Subject: [PATCH 1/1] intel iommu, irq, vfio and pci for 4.17.14
Signed-off-by: Dongli Zhang <[email protected]>
---
arch/x86/kernel/apic/msi.c | 13 +
arch/x86/kernel/apic/vector.c | 39 +++
arch/x86/kernel/tboot.c | 1 +
drivers/iommu/dmar.c | 89 +++++
drivers/iommu/intel-iommu.c | 669 +++++++++++++++++++++++++++++++++++-
drivers/iommu/intel_irq_remapping.c | 132 ++++++-
drivers/iommu/iommu.c | 83 +++++
drivers/iommu/iova.c | 18 +-
drivers/iommu/irq_remapping.c | 1 +
drivers/pci/irq.c | 3 +
drivers/pci/msi.c | 314 ++++++++++++++++-
drivers/pci/pci-driver.c | 9 +
drivers/pci/probe.c | 18 +
include/linux/dmar.h | 1 +
include/linux/iommu.h | 8 +
include/linux/msi.h | 3 +
include/linux/pci.h | 3 +
kernel/irq/chip.c | 6 +
kernel/irq/irqdesc.c | 3 +
kernel/irq/irqdomain.c | 6 +
kernel/irq/matrix.c | 5 +
kernel/irq/msi.c | 25 ++
lib/dma-direct.c | 3 +
23 files changed, 1428 insertions(+), 24 deletions(-)
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index ce503c9..92a3e10 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -25,8 +25,18 @@
static struct irq_domain *msi_default_domain;
+/*
+ * called by only:
+ * - kernel/irq/chip.c|1407| <<irq_chip_compose_msi_msg>> pos->chip->irq_compose_msi_msg(pos, msg);
+ *
+ * 编辑msi msg的地方
+ */
static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
{
+ /*
+ * 先通过apic_chip_data(irqd)获取类型struct apic_chip_data
+ * 然后返回apic_chip_data->hw_irq_cfg
+ */
struct irq_cfg *cfg = irqd_cfg(data);
msg->address_hi = MSI_ADDR_BASE_HI;
@@ -42,6 +52,9 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
MSI_ADDR_REDIRECTION_CPU |
MSI_ADDR_DEST_ID(cfg->dest_apicid);
+ /*
+ * 下面cfg->vector应该是cpu的vector, 不是设备的vector
+ */
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b708f59..4ffe103 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -87,6 +87,10 @@ static struct apic_chip_data *apic_chip_data(struct irq_data *irqd)
return irqd->chip_data;
}
+/*
+ * 先通过apic_chip_data(irqd)获取类型struct apic_chip_data
+ * 然后返回apic_chip_data->hw_irq_cfg
+ */
struct irq_cfg *irqd_cfg(struct irq_data *irqd)
{
struct apic_chip_data *apicd = apic_chip_data(irqd);
@@ -115,6 +119,30 @@ static void free_apic_chip_data(struct apic_chip_data *apicd)
kfree(apicd);
}
+/*
+ * [0] apic_update_irq_cfg
+ * [0] assign_managed_vector.isra.16
+ * [0] x86_vector_activate
+ * [0] __irq_domain_activate_irq
+ * [0] __irq_domain_activate_irq
+ * [0] irq_domain_activate_irq
+ * [0] irq_startup
+ * [0] __setup_irq
+ * [0] request_threaded_irq
+ * [0] pci_request_irq
+ * [0] queue_request_irq
+ * [0] nvme_reset_work
+ * [0] process_one_work
+ * [0] worker_thread
+ * [0] kthread
+ * [0] ret_from_fork
+ *
+ * called by:
+ * - arch/x86/kernel/apic/vector.c|199| <<vector_assign_managed_shutdown>> apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu);
+ * - arch/x86/kernel/apic/vector.c|281| <<assign_vector_locked>> apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
+ * - arch/x86/kernel/apic/vector.c|354| <<assign_managed_vector>> apic_update_irq_cfg(irqd, vector, cpu);
+ * - arch/x86/kernel/apic/vector.c|534| <<vector_configure_legacy>> apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
+ */
static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector,
unsigned int cpu)
{
@@ -314,6 +342,13 @@ assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info)
return reserve_irq_vector(irqd);
}
+/*
+ * called by:
+ * - activate_managed()
+ * - apic_set_affinity()
+ *
+ * 在这里分配的vector
+ */
static int
assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
{
@@ -713,6 +748,9 @@ int __init arch_early_irq_init(void)
* Allocate the vector matrix allocator data structure and limit the
* search area.
*/
+ /*
+ * 一个cpu支持256个vector?
+ */
vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR,
FIRST_SYSTEM_VECTOR);
BUG_ON(!vector_matrix);
@@ -809,6 +847,7 @@ static int apic_retrigger_irq(struct irq_data *irqd)
return 1;
}
+/* struct irq_chip intel_ir_chip.irq_ack = apic_ack_irq() */
void apic_ack_irq(struct irq_data *irqd)
{
irq_move_irq(irqd);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a2486f4..75325dc 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -520,6 +520,7 @@ struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tb
return dmar_tbl;
}
+/* 如果有CONFIG_INTEL_TXT会用如下函数 */
int tboot_force_iommu(void)
{
if (!tboot_enabled())
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 460bed4..ddb212a 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -65,6 +65,9 @@ struct dmar_res_callback {
* 2) Use RCU in interrupt context
*/
DECLARE_RWSEM(dmar_global_lock);
+/*
+ * 链接着所有的struct dmar_drhd_unit (有个field是intel_iommu指针)
+ */
LIST_HEAD(dmar_drhd_units);
struct acpi_table_header * __initdata dmar_tbl;
@@ -76,6 +79,13 @@ static void free_iommu(struct intel_iommu *iommu);
extern const struct iommu_ops intel_iommu_ops;
+/*
+ * called only by dmar_parse_one_drhd()
+ *
+ * drhd中有一个field是intel_iommu的指针
+ *
+ * 把struct dmar_drhd_unit (有个field是intel_iommu指针)链入链表dmar_drhd_units
+ */
static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
{
/*
@@ -88,6 +98,12 @@ static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
list_add_rcu(&drhd->list, &dmar_drhd_units);
}
+/*
+ * called by:
+ * - dmar_parse_one_drhd()
+ * - dmar_parse_one_rmrr()
+ * - dmar_parse_one_atsr()
+ */
void *dmar_alloc_dev_scope(void *start, void *end, int *cnt)
{
struct acpi_dmar_device_scope *scope;
@@ -369,6 +385,7 @@ static struct notifier_block dmar_pci_bus_nb = {
.priority = INT_MIN,
};
+/* 在dmar_drhd_units链表里搜索 */
static struct dmar_drhd_unit *
dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
{
@@ -387,17 +404,27 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
* structure which uniquely represent one DMA remapping hardware unit
* present in the platform
*/
+/*
+ * 非常重要的函数, 初始化了unit的iommu!!
+ *
+ * used at 2 locations:
+ * - ACPI_DMAR_TYPE_HARDWARE_UNIT的callback
+ * - 在dmar_hotplug_insert()用到过
+ */
static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
{
struct acpi_dmar_hardware_unit *drhd;
struct dmar_drhd_unit *dmaru;
int ret;
+ /* struct acpi_dmar_hardware_unit开始包含了一个acpi_dmar_header */
drhd = (struct acpi_dmar_hardware_unit *)header;
+ /* 在dmar_drhd_units链表里搜索 */
dmaru = dmar_find_dmaru(drhd);
if (dmaru)
goto out;
+ /* 分配一个struct dmar_drhd_unit, 里面有一个struct intel_iommu指针 */
dmaru = kzalloc(sizeof(*dmaru) + header->length, GFP_KERNEL);
if (!dmaru)
return -ENOMEM;
@@ -408,9 +435,11 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
*/
dmaru->hdr = (void *)(dmaru + 1);
memcpy(dmaru->hdr, header, header->length);
+ /* iommu寄存器的基地址 */
dmaru->reg_base_addr = drhd->address;
dmaru->segment = drhd->segment;
dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
+ /* 分配了一些struct dmar_dev_scope */
dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
((void *)drhd) + drhd->header.length,
&dmaru->devices_cnt);
@@ -419,6 +448,7 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
return -ENOMEM;
}
+ /* 非常重要的函数 为struct dmar_drhd_unit分配intel_iommu */
ret = alloc_iommu(dmaru);
if (ret) {
dmar_free_dev_scope(&dmaru->devices,
@@ -426,6 +456,7 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
kfree(dmaru);
return ret;
}
+ /* 把struct dmar_drhd_unit (有个field是intel_iommu指针)链入链表dmar_drhd_units */
dmar_register_drhd_unit(dmaru);
out:
@@ -539,6 +570,11 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
/**
* dmar_table_detect - checks to see if the platform supports DMAR devices
*/
+/*
+ * called by:
+ * - parse_dmar_table()
+ * - detect_intel_iommu()
+ */
static int __init dmar_table_detect(void)
{
acpi_status status = AE_OK;
@@ -812,11 +848,18 @@ void __init dmar_register_bus_notifier(void)
}
+/*
+ * called by:
+ * - intel_iommu_init()
+ * - intel_prepare_irq_remapping()
+ */
int __init dmar_table_init(void)
{
+ /* 这个static变量只在dmar_table_init()改变! */
static int dmar_table_initialized;
int ret;
+ /* 这个static变量只在dmar_table_init()改变! */
if (dmar_table_initialized == 0) {
ret = parse_dmar_table();
if (ret < 0) {
@@ -827,12 +870,14 @@ int __init dmar_table_init(void)
ret = -ENODEV;
}
+ /* 这个static变量只在dmar_table_init()改变! */
if (ret < 0)
dmar_table_initialized = ret;
else
dmar_table_initialized = 1;
}
+ /* 这个static变量只在dmar_table_init()改变! */
return dmar_table_initialized < 0 ? dmar_table_initialized : 0;
}
@@ -870,6 +915,7 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
return -EINVAL;
}
+ /* Hardware supported capabilities */
cap = dmar_readq(addr + DMAR_CAP_REG);
ecap = dmar_readq(addr + DMAR_ECAP_REG);
@@ -886,6 +932,9 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
return 0;
}
+/*
+ * 探测平台环境上是否有IOMMU硬件
+ */
int __init detect_intel_iommu(void)
{
int ret;
@@ -933,6 +982,9 @@ static void unmap_iommu(struct intel_iommu *iommu)
* Memory map the iommu's registers. Start w/ a single page, and
* possibly expand if that turns out to be insufficent.
*/
+/*
+ * called only by alloc_iommu()
+ */
static int map_iommu(struct intel_iommu *iommu, u64 phys_addr)
{
int map_size, err=0;
@@ -1016,6 +1068,16 @@ static void dmar_free_seq_id(struct intel_iommu *iommu)
}
}
+/*
+ * 除了AMD特别的部分, 仅仅被dmar_parse_one_drhd()调用
+ *
+ * The host platform may support one or more remapping hardware units. Each hardware unit
+ * supports remapping DMA requests originating within its hardware scope. For example, a
+ * desktop platform may expose a single remapping hardware unit that translates all DMA
+ * transactions at the memory controller hub (MCH) component. A server platform with one
+ * or more core components may support independent translation hardware units in each
+ * component.
+ */
static int alloc_iommu(struct dmar_drhd_unit *drhd)
{
struct intel_iommu *iommu;
@@ -1024,6 +1086,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
int msagaw = 0;
int err;
+ /* 此unit的iommu的寄存器基地址 */
if (!drhd->reg_base_addr) {
warn_invalid_dmar(0, "");
return -EINVAL;
@@ -1039,6 +1102,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
goto error;
}
+ /*
+ * map这个iommu的寄存器 比如base, cap等
+ */
err = map_iommu(iommu, drhd->reg_base_addr);
if (err) {
pr_err("Failed to map %s\n", iommu->name);
@@ -1046,12 +1112,18 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
}
err = -EINVAL;
+ /*
+ * Adjusted Guest-Address Width (AGAW)
+ */
agaw = iommu_calculate_agaw(iommu);
if (agaw < 0) {
pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n",
iommu->seq_id);
goto err_unmap;
}
+ /*
+ * Supported Adjusted Guest-Address Width (SAGAW)
+ */
msagaw = iommu_calculate_max_sagaw(iommu);
if (msagaw < 0) {
pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n",
@@ -1064,6 +1136,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
iommu->node = -1;
+ /* Arch version supported by this IOMMU */
ver = readl(iommu->reg + DMAR_VER_REG);
pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
iommu->name,
@@ -1612,6 +1685,9 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
}
#define PRIMARY_FAULT_REG_LEN (16)
+/*
+ * DMA Remapping 缺页的中断函数
+ */
irqreturn_t dmar_fault(int irq, void *dev_id)
{
struct intel_iommu *iommu = dev_id;
@@ -1687,6 +1763,12 @@ irqreturn_t dmar_fault(int irq, void *dev_id)
return IRQ_HANDLED;
}
+/*
+ * called by:
+ * - enable_drhd_fault_handling()
+ * - init_dmars()
+ * - intel_iommu_add()
+ */
int dmar_set_interrupt(struct intel_iommu *iommu)
{
int irq, ret;
@@ -1697,6 +1779,9 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
if (iommu->irq)
return 0;
+ /*
+ * sequence id of the iommu
+ */
irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
if (irq > 0) {
iommu->irq = irq;
@@ -1711,6 +1796,9 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
return ret;
}
+/*
+ * struct irq_remap_ops intel_irq_remap_ops.enable_faulting = enable_drhd_fault_handling()
+ */
int __init enable_drhd_fault_handling(void)
{
struct dmar_drhd_unit *drhd;
@@ -2035,6 +2123,7 @@ static int dmar_device_hotplug(acpi_handle handle, bool insert)
return ret;
}
+/* called only by acpi_pci_root_add() */
int dmar_device_add(acpi_handle handle)
{
return dmar_device_hotplug(handle, true);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 6392a49..2b33df6 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -53,6 +53,47 @@
#include "irq_remapping.h"
+/*
+ * Host平台上可能会存在一个或者多个DMA Remapping硬件单元, 而每个硬件单元支持在它管理的
+ * 设备范围内的所有设备的DMA Remapping. 例如, 你的台式机CPU Core i7 7700k在MCH中只集成
+ * 一个DMA Remapping硬件单元(IOMMU), 但在多路服务器上可能集成有多个DMA Remapping硬件单
+ * 元. 每个硬件单元负责管理挂载到它所在的PCIe Root Port下所有设备的DMA请求. BIOS会将平
+ * 台上的DMA Remapping硬件信息通过ACPI协议报告给操作系统, 再由操作系统来初始化和管理这
+ * 些硬件设备.
+ *
+ * 在多路服务器上我们可以有多个DMAR Unit (这里可以直接理解为多个IOMMU硬件), 每个DMAR会
+ * 负责处理其下挂载设备的DMA请求进行地址翻译. 例如, PCIE Root Port (dev:fun) (14:0)下面
+ * 挂载的所有设备的DMA请求由DMAR #1负责处理, PCIE Root Port (dev:fun) (14:1)下面挂载的
+ * 所有设备的DMA请求由DMAR #2负责处理, 而DMAR #3下挂载的是一个Root-Complex集成设备[29:0],
+ * 这个设备的DMA请求被DMAR #3承包, DMAR #4的情况比较复杂, 它负责处理Root-Complex集成设
+ * 备[30:0]以及I/OxAPIC设备的DMA请求.
+ *
+ * 这些和IOMMU相关的硬件拓扑信息需要BIOS通过ACPI表呈现给OS, 这样OS才能正确驱动IOMMU硬件工作。
+ */
+
+/*
+ * DRHD: DMA Remapping Hardware Unit Definition 用来描述DMAR Unit(IOMMU)的基本信息
+ *
+ * RMRR: Reserved Memory Region Reporting 用来描述那些保留的物理地址, 这段地址空间不被重映射
+ *
+ * ATSR: Root Port ATS Capability 仅限于有Device-TLB的情形, Root Port需要向OS报告支持ATS的能力
+ *
+ * RHSA: Remapping Hardware Static Affinity Remapping亲和性, 在有NUMA的系统下可以提升DMA Remapping的性能
+ */
+
+/*
+ * IOMMU硬件会截获直通设备发出的请求, 然后根据其Request ID查表找到对应的Address Translation Structure
+ * 即该Domain的IOMMU页表基地址, 这样一来该设备的DMA地址翻译就只会按这个Domain的IOMMU页表的方式进行翻译,
+ * 翻译后的HPA必然落在此Domain的地址空间内(这个过程由IOMMU硬件中自动完成), 而不会访问到其他Domain的地址
+ * 空间, 这样就达到了DMA隔离的目的.
+ */
+
+/*
+ * # acpidump -b
+ * # iasl -d dmar.dat
+ * # vim dmar.dsl
+ */
+
#define ROOT_SIZE VTD_PAGE_SIZE
#define CONTEXT_SIZE VTD_PAGE_SIZE
@@ -172,6 +213,7 @@ static inline unsigned long virt_to_dma_pfn(void *p)
}
/* global iommu list, set NULL for ignored DMAR units */
+/* 数组是在init_dmars()上分配的 */
static struct intel_iommu **g_iommus;
static void __init check_tylersburg_isoch(void);
@@ -194,12 +236,18 @@ struct root_entry {
u64 lo;
u64 hi;
};
+/*
+ * VT-d hardware uses 4KiB page size regardless of host page size.
+ *
+ * Used only by free_context_table()
+ */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
* Take a root_entry and return the Lower Context Table Pointer (LCTP)
* if marked present.
*/
+/* called only by copy_context_table() */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
if (!(re->lo & 1))
@@ -212,6 +260,7 @@ static phys_addr_t root_entry_lctp(struct root_entry *re)
* Take a root_entry and return the Upper Context Table Pointer (UCTP)
* if marked present.
*/
+/* called only by copy_context_table() */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
if (!(re->hi & 1))
@@ -284,6 +333,11 @@ static inline void context_set_translation_type(struct context_entry *context,
context->lo |= (value & 3) << 2;
}
+/*
+ * called only by domain_context_mapping_one()
+ *
+ * 设置context entry的页表base
+ */
static inline void context_set_address_root(struct context_entry *context,
unsigned long value)
{
@@ -297,12 +351,19 @@ static inline void context_set_address_width(struct context_entry *context,
context->hi |= value & 7;
}
+/* called only by domain_context_mapping_one() */
static inline void context_set_domain_id(struct context_entry *context,
unsigned long value)
{
context->hi |= (value & ((1 << 16) - 1)) << 8;
}
+/*
+ * called by:
+ * - domain_context_mapping_one()
+ * - domain_context_clear_one()
+ * - copy_context_table()
+ */
static inline int context_domain_id(struct context_entry *c)
{
return((c->hi >> 8) & 0xffff);
@@ -375,6 +436,7 @@ static int hw_pass_through = 1;
/* si_domain contains mulitple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
+/* 每个iommu unit引用这个domain的iommu_refcnt */
#define for_each_domain_iommu(idx, domain) \
for (idx = 0; idx < g_num_of_iommus; idx++) \
if (domain->iommu_refcnt[idx])
@@ -382,7 +444,7 @@ static int hw_pass_through = 1;
struct dmar_domain {
int nid; /* node id */
- unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED];
+ unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED]; // 每个iommu unit引用这个domain的cnt
/* Refcount of devices per iommu */
@@ -395,6 +457,10 @@ struct dmar_domain {
struct list_head devices; /* all devices' list */
struct iova_domain iovad; /* iova's that belong to this domain */
+ /*
+ * iommu domain的root entry, 不是iommu的root
+ * 在domain_init()分配
+ */
struct dma_pte *pgd; /* virtual address */
int gaw; /* max guest address width */
@@ -425,7 +491,7 @@ struct device_domain_info {
u8 pasid_enabled:1;
u8 pri_supported:1;
u8 pri_enabled:1;
- u8 ats_supported:1;
+ u8 ats_supported:1; /* Address Translation Services */
u8 ats_enabled:1;
u8 ats_qdep;
struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
@@ -470,12 +536,17 @@ static void domain_context_clear(struct intel_iommu *iommu,
static int domain_detach_iommu(struct dmar_domain *domain,
struct intel_iommu *iommu);
+/*
+ * 如果intel_iommu=on, dmar_disabled = 0
+ * 如果intel_iommu=off, dmar_disabled = 1
+ */
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
+/* 在intel_iommu_init()设置成1 */
int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);
@@ -483,7 +554,7 @@ static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
-static int intel_iommu_ecs = 1;
+static int intel_iommu_ecs = 1; // 是否支持extended context table, 如果intel_iommu=ecs_off就一定是0
static int intel_iommu_pasid28;
static int iommu_identity_mapping;
@@ -535,6 +606,10 @@ static void clear_translation_pre_enabled(struct intel_iommu *iommu)
iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}
+/*
+ * 对于intel, 只被init_dmars()调用
+ * 对于amd, 只被init_iommu_one()调用
+ */
static void init_translation_status(struct intel_iommu *iommu)
{
u32 gsts;
@@ -550,6 +625,9 @@ static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
return container_of(dom, struct dmar_domain, domain);
}
+/*
+ * 在下面用来解析"intel_iommu="
+ */
static int __init intel_iommu_setup(char *str)
{
if (!str)
@@ -707,6 +785,9 @@ static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
/*
* Calculate max SAGAW for each iommu.
*/
+/*
+ * Supported Adjusted Guest-Address Width (SAGAW)
+ */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
@@ -717,18 +798,31 @@ int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
* "SAGAW" may be different across iommus, use a default agaw, and
* get a supported less agaw for iommus that don't support the default agaw.
*/
+/*
+ * Adjusted Guest-Address Width (AGAW)
+ * Supported Adjusted Guest-Address Width (SAGAW)
+ */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This functionin only returns single iommu in a domain */
+/*
+ * called by:
+ * - __intel_map_single()
+ * - intel_unmap()
+ * - intel_map_sg()
+ *
+ * 把一个struct dmar_domain转换成struct intel_iommu: 最终还是通过g_iommus
+ */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
int iommu_id;
/* si_domain and vm domain should not get here. */
BUG_ON(domain_type_is_vm_or_si(domain));
+ /* 对于每一个iommu的idx, 查看domain->iommu_refcnt[idx]是否为 0 */
for_each_domain_iommu(iommu_id, domain)
break;
@@ -768,6 +862,12 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain)
rcu_read_unlock();
}
+/*
+ * called by:
+ * - domain_update_iommu_cap()
+ * - intel_iommu_add()
+ * - intel_iommu_capable()
+ */
static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
struct dmar_drhd_unit *drhd;
@@ -788,6 +888,11 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip)
return ret;
}
+/*
+ * called by:
+ * - domain_update_iommu_cap()
+ * - intel_iommu_add()
+ */
static int domain_update_iommu_superpage(struct intel_iommu *skip)
{
struct dmar_drhd_unit *drhd;
@@ -820,14 +925,42 @@ static void domain_update_iommu_cap(struct dmar_domain *domain)
domain->iommu_superpage = domain_update_iommu_superpage(NULL);
}
+/*
+ * called by:
+ * - device_context_mapped()
+ * - free_context_table() --> 两次
+ * - domain_context_mapping_one()
+ * - domain_context_clear_one()
+ * - intel_iommu_enable_pasid()
+ *
+ * 根据bus, dev和func号 获取对应context entry的地址
+ */
static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
u8 bus, u8 devfn, int alloc)
{
+ /* 根据bus号从root entry table中获取相应entry */
struct root_entry *root = &iommu->root_entry[bus];
struct context_entry *context;
u64 *entry;
+ /* 根据root table entry获取其指向的context entry table */
entry = &root->lo;
+ /*
+ * For implementations supporting Extended-Context-Support (ECS=1 in Extended
+ * Capability Register), the Root Table Address Register (RTADDR_REG) points to
+ * an extended-root-table when Root-Table-Type field in the Register is Set (RTT=1).
+ *
+ * For implementations supporting Extended-Context-Support (ECS=1 in Extended
+ * Capability Register), when using extended-root-table, each extended-root-entry
+ * references a lower-context-table and a upper-context-table.
+ *
+ * The lower-context-table is 4-KByte in size and contains 128 extended-context-entries
+ * corresponding to PCI functions in device range 0-15 on the bus.
+ *
+ * The upper-context-table is also 4-Kbyte in size and contains 128
+ * extended-context-entries corresponding to PCI functions in device range 16-31 on
+ * the bus.
+ */
if (ecs_enabled(iommu)) {
if (devfn >= 0x80) {
devfn -= 0x80;
@@ -851,6 +984,7 @@ static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu
*entry = phy_addr | 1;
__iommu_flush_cache(iommu, entry, sizeof(*entry));
}
+ /* 根据devfn, 返回context entry table中的entry */
return &context[devfn];
}
@@ -859,6 +993,20 @@ static int iommu_dummy(struct device *dev)
return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
+/*
+ * called by
+ * - domain_context_mapped() 2次
+ * - find_or_alloc_domain()
+ * - set_domain_for_dev()
+ * - domain_add_dev_info()
+ * - intel_iommu_attach_device()
+ * - intel_iommu_add_device()
+ * - intel_iommu_remove_device()
+ * - intel_svm_device_to_iommu()
+ *
+ * 遍历所有dmar_drhd_units链表中的iommu hardware unit 寻找满足某种条件的iommu
+ * 其实就是返回device所属的iommu
+ */
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
struct dmar_drhd_unit *drhd = NULL;
@@ -891,6 +1039,7 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf
dev = &ACPI_COMPANION(dev)->dev;
rcu_read_lock();
+ /* 遍历所有dmar_drhd_units链表中的iommu hardware unit */
for_each_active_iommu(iommu, drhd) {
if (pdev && segment != drhd->segment)
continue;
@@ -941,6 +1090,10 @@ static void domain_flush_cache(struct dmar_domain *domain,
clflush_cache_range(addr, size);
}
+/*
+ * root entry中存着bus number的指针
+ * context entry中存着dev和fn组合的指针
+ */
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
struct context_entry *context;
@@ -955,6 +1108,9 @@ static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
return ret;
}
+/*
+ * called only by free_dmar_iommu()
+ */
static void free_context_table(struct intel_iommu *iommu)
{
int i;
@@ -966,6 +1122,7 @@ static void free_context_table(struct intel_iommu *iommu)
goto out;
}
for (i = 0; i < ROOT_ENTRY_NR; i++) {
+ /* 根据bus, dev和func号 获取对应context entry的地址 */
context = iommu_context_addr(iommu, i, 0, 0);
if (context)
free_pgtable_page(context);
@@ -984,6 +1141,12 @@ static void free_context_table(struct intel_iommu *iommu)
spin_unlock_irqrestore(&iommu->lock, flags);
}
+/*
+ * called by:
+ * - __domain_mapping()
+ * - intel_iommu_unmap()
+ * - intel_iommu_iova_to_phys()
+ */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
unsigned long pfn, int *target_level)
{
@@ -1250,6 +1413,13 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
/* We can't just free the pages because the IOMMU may still be walking
the page tables, and may have cached the intermediate levels. The
pages can only be freed after the IOTLB flush has been done. */
+/*
+ * called by:
+ * - domain_exit()
+ * - intel_unmap()
+ * - intel_iommu_memory_notifier()
+ * - intel_iommu_unmap()
+ */
static struct page *domain_unmap(struct dmar_domain *domain,
unsigned long start_pfn,
unsigned long last_pfn)
@@ -1294,11 +1464,20 @@ static void iova_entry_free(unsigned long data)
}
/* iommu handling */
+/*
+ * called by:
+ * - init_dmars()
+ * - intel_iommu_add()
+ *
+ * 为当前iommu (unit)分配第一层的root_entry (里面存着bus entry) 虚拟地址
+ * 在别的地方会用iommu_set_root_entry()写入寄存器
+ */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
struct root_entry *root;
unsigned long flags;
+ /* 分配一个page (order=0), 返回虚拟地址 */
root = (struct root_entry *)alloc_pgtable_page(iommu->node);
if (!root) {
pr_err("Allocating root entry for %s failed\n",
@@ -1315,6 +1494,34 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu)
return 0;
}
+/*
+ * DMAR_RTADDR_REG: Root entry table
+ *
+ * The root-table functions as the top level structure to map devices to their
+ * respective domains. The location of the root-table in system memory is programmed
+ * through the Root Table Address Register. The root-table is 4-KByte in size and
+ * contains 256 root-entries to cover the PCI bus number space (0-255).
+ *
+ * The bus number (upper 8-bits) encoded in a request's source-id field is used to
+ * index into the root-entry structure.
+ *
+ * The Root Entry contains the Context-table pointer, which references the
+ * context-table for devices on the bus identified by the root-entry.
+ *
+ * The Extended Root Entry has more fields than Root Entry:
+ * - Lower Present Flag
+ * - Lower Context-table pointer
+ * - Upper Present flag
+ * - Upper Context-table pointer
+ *
+ *
+ * Called by:
+ * - init_dmars()
+ * - init_iommu_hw()
+ * - intel_iommu_add()
+ *
+ * 把iommu的root table的地址写入DMAR_RTADDR_REG: Root entry table
+ */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
u64 addr;
@@ -1641,6 +1848,18 @@ static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
+/*
+ * DMA_GCMD_TE: Software writes to this field to request hardware to
+ * enable/disable DMA remapping
+ *
+ * 0: Disable DMA remapping
+ * 1: Enable DMA remapping
+ *
+ * called by:
+ * - init_dmars()
+ * - init_iommu_hw()
+ * - intel_iommu_add()
+ */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
u32 sts;
@@ -1657,6 +1876,14 @@ static void iommu_enable_translation(struct intel_iommu *iommu)
raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
+/*
+ * called by:
+ * - disable_dmar_iommu()
+ * - init_dmars() 两次
+ * - iommu_suspend()
+ * - intel_iommu_add()
+ * - intel_disable_iommus()
+ */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
u32 sts;
@@ -1674,18 +1901,31 @@ static void iommu_disable_translation(struct intel_iommu *iommu)
}
+/*
+ * called by:
+ * - init_dmars()
+ * - intel_iommu_add()
+ *
+ * 初始化intel_iommu的二维(struct dmar_domain *)数组
+ */
static int iommu_init_domains(struct intel_iommu *iommu)
{
u32 ndomains, nlongs;
size_t size;
+ /* 获取该iommu unit支持的最大domain数目 */
ndomains = cap_ndoms(iommu->cap);
pr_debug("%s: Number of Domains supported <%d>\n",
iommu->name, ndomains);
+ /*
+ * ndomains是domain id bitmap需要的bit数
+ * nlongs是存放这些bit所需的unsigned long个数
+ */
nlongs = BITS_TO_LONGS(ndomains);
spin_lock_init(&iommu->lock);
+ /* 分配 bitmap of domains */
iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
if (!iommu->domain_ids) {
pr_err("%s: Allocating domain id array failed\n",
@@ -1693,7 +1933,15 @@ static int iommu_init_domains(struct intel_iommu *iommu)
return -ENOMEM;
}
+ /*
+ * iommu->domains是三级指针 (struct dmar_domain ***), 实际是一个两级数组
+ *
+ * 第一级的每个元素指向一块indirect内存
+ *
+ * 每块indirect内存存放256个 (struct dmar_domain *)
+ */
size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
+ /* struct dmar_domain ***domains; */
iommu->domains = kzalloc(size, GFP_KERNEL);
if (iommu->domains) {
@@ -1793,10 +2041,76 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
#endif
}
+/*
+ * alloc_domain()的几个例子
+ *
+ * [ 0.328140] alloc_domain