Skip to content

Commit 05c68e2

Browse files
authored
Add collector for PCIe devices with link information (prometheus#3339)
* Add collector for PCIe devices with link information The link status of PCIe devices sometimes changes (for example, link width or speed downgrades), and devices can disappear. This patch collects PCIe devices' link information to detect such failures. As a first step, this collector exports PCIe devices' - Device information (vendor_id, device_id, etc.) - Parent PCIe device (e.g. PCIe bridge, PCIe switch) - Link status (max_link_{transfers_per_second|width}, current_link_{transfers_per_second|width}) --------- Signed-off-by: Naoki MATSUMOTO <[email protected]>
1 parent 581a909 commit 05c68e2

File tree

6 files changed

+2912
-417
lines changed

6 files changed

+2912
-417
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
201201
meminfo\_numa | Exposes memory statistics from `/sys/devices/system/node/node[0-9]*/meminfo`, `/sys/devices/system/node/node[0-9]*/numastat`. | Linux
202202
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
203203
network_route | Exposes the routing table as metrics | Linux
204+
pcidevice | Exposes PCI devices' information, including their link status and parent devices. | Linux
204205
perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux
205206
processes | Exposes aggregate process statistics from `/proc`. | Linux
206207
qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux

collector/fixtures/e2e-64k-page-output.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2832,6 +2832,26 @@ node_os_info{build_id="",id="ubuntu",id_like="debian",image_id="",image_version=
28322832
# HELP node_os_version Metric containing the major.minor part of the OS version.
28332833
# TYPE node_os_version gauge
28342834
node_os_version{id="ubuntu",id_like="debian",name="Ubuntu"} 20.04
2835+
# HELP node_pcidevice_current_link_transfers_per_second Value of current link's transfers per second (T/s)
2836+
# TYPE node_pcidevice_current_link_transfers_per_second gauge
2837+
node_pcidevice_current_link_transfers_per_second{bus="00",device="02",function="1",segment="0000"} 8e+09
2838+
node_pcidevice_current_link_transfers_per_second{bus="01",device="00",function="0",segment="0000"} 8e+09
2839+
# HELP node_pcidevice_current_link_width Value of current link's width (number of lanes)
2840+
# TYPE node_pcidevice_current_link_width gauge
2841+
node_pcidevice_current_link_width{bus="00",device="02",function="1",segment="0000"} 4
2842+
node_pcidevice_current_link_width{bus="01",device="00",function="0",segment="0000"} 4
2843+
# HELP node_pcidevice_info Non-numeric data from /sys/bus/pci/devices/<location>, value is always 1.
2844+
# TYPE node_pcidevice_info gauge
2845+
node_pcidevice_info{bus="00",class_id="0x060400",device="02",function="1",parent_bus="*",parent_device="*",parent_function="*",parent_segment="*",revision="0x00",segment="0000",subsystem_device_id="0x5095",subsystem_vendor_id="0x17aa",vendor_id="0x1634"} 1
2846+
node_pcidevice_info{bus="01",class_id="0x010802",device="00",function="0",parent_bus="00",parent_device="02",parent_function="1",parent_segment="0000",revision="0x01",segment="0000",subsystem_device_id="0x5021",subsystem_vendor_id="0xc0a9",vendor_id="0x540a"} 1
2847+
# HELP node_pcidevice_max_link_transfers_per_second Value of maximum link's transfers per second (T/s)
2848+
# TYPE node_pcidevice_max_link_transfers_per_second gauge
2849+
node_pcidevice_max_link_transfers_per_second{bus="00",device="02",function="1",segment="0000"} 8e+09
2850+
node_pcidevice_max_link_transfers_per_second{bus="01",device="00",function="0",segment="0000"} 1.6e+10
2851+
# HELP node_pcidevice_max_link_width Value of maximum link's width (number of lanes)
2852+
# TYPE node_pcidevice_max_link_width gauge
2853+
node_pcidevice_max_link_width{bus="00",device="02",function="1",segment="0000"} 8
2854+
node_pcidevice_max_link_width{bus="01",device="00",function="0",segment="0000"} 4
28352855
# HELP node_power_supply_capacity capacity value of /sys/class/power_supply/<power_supply>.
28362856
# TYPE node_power_supply_capacity gauge
28372857
node_power_supply_capacity{power_supply="BAT0"} 81
@@ -2991,6 +3011,7 @@ node_scrape_collector_success{collector="nfs"} 1
29913011
node_scrape_collector_success{collector="nfsd"} 1
29923012
node_scrape_collector_success{collector="nvme"} 1
29933013
node_scrape_collector_success{collector="os"} 1
3014+
node_scrape_collector_success{collector="pcidevice"} 1
29943015
node_scrape_collector_success{collector="powersupplyclass"} 1
29953016
node_scrape_collector_success{collector="pressure"} 1
29963017
node_scrape_collector_success{collector="processes"} 1

collector/fixtures/e2e-output.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2854,6 +2854,26 @@ node_os_info{build_id="",id="ubuntu",id_like="debian",image_id="",image_version=
28542854
# HELP node_os_version Metric containing the major.minor part of the OS version.
28552855
# TYPE node_os_version gauge
28562856
node_os_version{id="ubuntu",id_like="debian",name="Ubuntu"} 20.04
2857+
# HELP node_pcidevice_current_link_transfers_per_second Value of current link's transfers per second (T/s)
2858+
# TYPE node_pcidevice_current_link_transfers_per_second gauge
2859+
node_pcidevice_current_link_transfers_per_second{bus="00",device="02",function="1",segment="0000"} 8e+09
2860+
node_pcidevice_current_link_transfers_per_second{bus="01",device="00",function="0",segment="0000"} 8e+09
2861+
# HELP node_pcidevice_current_link_width Value of current link's width (number of lanes)
2862+
# TYPE node_pcidevice_current_link_width gauge
2863+
node_pcidevice_current_link_width{bus="00",device="02",function="1",segment="0000"} 4
2864+
node_pcidevice_current_link_width{bus="01",device="00",function="0",segment="0000"} 4
2865+
# HELP node_pcidevice_info Non-numeric data from /sys/bus/pci/devices/<location>, value is always 1.
2866+
# TYPE node_pcidevice_info gauge
2867+
node_pcidevice_info{bus="00",class_id="0x060400",device="02",function="1",parent_bus="*",parent_device="*",parent_function="*",parent_segment="*",revision="0x00",segment="0000",subsystem_device_id="0x5095",subsystem_vendor_id="0x17aa",vendor_id="0x1634"} 1
2868+
node_pcidevice_info{bus="01",class_id="0x010802",device="00",function="0",parent_bus="00",parent_device="02",parent_function="1",parent_segment="0000",revision="0x01",segment="0000",subsystem_device_id="0x5021",subsystem_vendor_id="0xc0a9",vendor_id="0x540a"} 1
2869+
# HELP node_pcidevice_max_link_transfers_per_second Value of maximum link's transfers per second (T/s)
2870+
# TYPE node_pcidevice_max_link_transfers_per_second gauge
2871+
node_pcidevice_max_link_transfers_per_second{bus="00",device="02",function="1",segment="0000"} 8e+09
2872+
node_pcidevice_max_link_transfers_per_second{bus="01",device="00",function="0",segment="0000"} 1.6e+10
2873+
# HELP node_pcidevice_max_link_width Value of maximum link's width (number of lanes)
2874+
# TYPE node_pcidevice_max_link_width gauge
2875+
node_pcidevice_max_link_width{bus="00",device="02",function="1",segment="0000"} 8
2876+
node_pcidevice_max_link_width{bus="01",device="00",function="0",segment="0000"} 4
28572877
# HELP node_power_supply_capacity capacity value of /sys/class/power_supply/<power_supply>.
28582878
# TYPE node_power_supply_capacity gauge
28592879
node_power_supply_capacity{power_supply="BAT0"} 81
@@ -3013,6 +3033,7 @@ node_scrape_collector_success{collector="nfs"} 1
30133033
node_scrape_collector_success{collector="nfsd"} 1
30143034
node_scrape_collector_success{collector="nvme"} 1
30153035
node_scrape_collector_success{collector="os"} 1
3036+
node_scrape_collector_success{collector="pcidevice"} 1
30163037
node_scrape_collector_success{collector="powersupplyclass"} 1
30173038
node_scrape_collector_success{collector="pressure"} 1
30183039
node_scrape_collector_success{collector="processes"} 1

0 commit comments

Comments
 (0)