Skip to content

Commit 69d0e7d

Browse files
committed
Add node metrics
1 parent f51084f commit 69d0e7d

File tree

5 files changed

+362
-239
lines changed

5 files changed

+362
-239
lines changed

pkg/agent/agent.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"reflect"
1111
"time"
1212

13+
"github.com/prometheus/client_golang/prometheus"
1314
corev1 "k8s.io/api/core/v1"
1415
"k8s.io/apimachinery/pkg/api/errors"
1516
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -124,6 +125,7 @@ func NewAgent(ctx context.Context,
124125
if err != nil {
125126
return agent, err
126127
}
128+
prometheus.MustRegister(metrics.NewNodeResourceCollector(nodeName, nodeInformer.Lister(), nodeResourceManager.GetResource))
127129
managers = appendManagerIfNotNil(managers, nodeResourceManager)
128130
}
129131

@@ -134,6 +136,8 @@ func NewAgent(ctx context.Context,
134136

135137
agent.managers = managers
136138

139+
prometheus.MustRegister(metrics.NewPodResourceCollector(podInformer.Lister()))
140+
137141
return agent, nil
138142
}
139143

pkg/known/types.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package known
22

3+
import "k8s.io/apimachinery/pkg/api/resource"
4+
35
type Module string
46

57
const (
@@ -9,3 +11,16 @@ const (
911
ModuleNodeResourceManager Module = "ModuleNodeResourceManager"
1012
ModulePodResourceManager Module = "ModulePodResourceManager"
1113
)
14+
15+
type ResourceStatus struct {
16+
CPUReserved *resource.Quantity
17+
CPUUsage *resource.Quantity
18+
CPUUsageOffline *resource.Quantity
19+
CPUSetIdle *resource.Quantity
20+
MemoryReserved *resource.Quantity
21+
MemoryUsage *resource.Quantity
22+
MemoryUsageOffline *resource.Quantity
23+
24+
CPUReservedTSP *resource.Quantity
25+
MemoryReservedTSP *resource.Quantity
26+
}

pkg/known/vars.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
package known
22

3-
import "os"
3+
import (
4+
"os"
5+
6+
corev1 "k8s.io/api/core/v1"
7+
)
48

59
var (
610
CraneSystemNamespace = "crane-system"
@@ -11,3 +15,13 @@ func init() {
1115
CraneSystemNamespace = namespace
1216
}
1317
}
18+
19+
// ElasticResourcePrefix is the namespace prefix prepended to a core resource
// name to form the corresponding crane elastic resource name.
const ElasticResourcePrefix = "gocrane.io/"
23+
24+
var (
25+
ElasticCPU = ElasticResourcePrefix + corev1.ResourceCPU
26+
ElasticMemory = ElasticResourcePrefix + corev1.ResourceMemory
27+
)

pkg/metrics/node.go

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
package metrics
2+
3+
import (
4+
"github.com/gocrane/crane/pkg/known"
5+
"github.com/prometheus/client_golang/prometheus"
6+
"k8s.io/apimachinery/pkg/api/resource"
7+
"k8s.io/apimachinery/pkg/labels"
8+
v1 "k8s.io/client-go/listers/core/v1"
9+
"k8s.io/klog/v2"
10+
)
11+
12+
// CraneNodeSubsystem is the subsystem label for node-level metrics.
// The metric names declared in this file are spelled out literally rather
// than being assembled from this constant.
const CraneNodeSubsystem = "node"

// CranePodSubsystem is the subsystem label for pod-level metrics.
// The metric names declared in this file are spelled out literally rather
// than being assembled from this constant.
const CranePodSubsystem = "pod"
16+
17+
var (
18+
podElasticCPUDesc = prometheus.NewDesc("crane_pod_elastic_cpu_request",
19+
"The elastic cpu requested by pod",
20+
[]string{"pod", "namespace"},
21+
nil,
22+
)
23+
podElasticMemoryDesc = prometheus.NewDesc("crane_pod_elastic_memory_request",
24+
"The elastic cpu requested by pod",
25+
[]string{"pod", "namespace"},
26+
nil,
27+
)
28+
)
29+
30+
func NewPodResourceCollector(podLister v1.PodLister) *PodResourceCollector {
31+
return &PodResourceCollector{
32+
podLister: podLister,
33+
}
34+
}
35+
36+
type PodResourceCollector struct {
37+
podLister v1.PodLister
38+
}
39+
40+
func (n *PodResourceCollector) Describe(descs chan<- *prometheus.Desc) {
41+
descs <- podElasticCPUDesc
42+
descs <- podElasticMemoryDesc
43+
}
44+
45+
func (n *PodResourceCollector) Collect(metrics chan<- prometheus.Metric) {
46+
pods, err := n.podLister.List(labels.Everything())
47+
if err != nil {
48+
klog.ErrorS(err, "list pods failed")
49+
return
50+
}
51+
52+
for _, pod := range pods {
53+
eCPU, eMemory := resource.NewQuantity(0, resource.DecimalSI), resource.NewQuantity(0, resource.BinarySI)
54+
for _, container := range pod.Spec.Containers {
55+
eCPU.Add(*container.Resources.Requests.Name(known.ElasticCPU, resource.DecimalSI))
56+
eMemory.Add(*container.Resources.Requests.Name(known.ElasticCPU, resource.DecimalSI))
57+
}
58+
if eCPU.IsZero() && eMemory.IsZero() {
59+
continue
60+
}
61+
metrics <- prometheus.MustNewConstMetric(podElasticCPUDesc, prometheus.GaugeValue, eCPU.AsApproximateFloat64(), pod.Name, pod.Namespace)
62+
metrics <- prometheus.MustNewConstMetric(podElasticMemoryDesc, prometheus.GaugeValue, eMemory.AsApproximateFloat64(), pod.Name, pod.Namespace)
63+
}
64+
}
65+
66+
var (
67+
nodeElasticCPUDesc = prometheus.NewDesc("crane_node_elastic_cpu_allocatable",
68+
"The elastic cpu of the node.",
69+
[]string{"node"},
70+
nil,
71+
)
72+
nodeElasticMemoryDesc = prometheus.NewDesc("crane_node_elastic_memory_allocatable",
73+
"The elastic memory requested by pod",
74+
[]string{"node"},
75+
nil,
76+
)
77+
nodeCPUAllocatableDesc = prometheus.NewDesc("crane_node_cpu_allocatable",
78+
"The cpu allocatable of the node.",
79+
[]string{"node"},
80+
nil,
81+
)
82+
nodeCPUCapacityDesc = prometheus.NewDesc("crane_node_cpu_capacity",
83+
"The cpu capacity of the node.",
84+
[]string{"node"},
85+
nil,
86+
)
87+
nodeMemoryAllocatableDesc = prometheus.NewDesc("crane_node_memory_allocatable",
88+
"The memory allocatable requested by pod",
89+
[]string{"node"},
90+
nil,
91+
)
92+
nodeMemoryCapacityDesc = prometheus.NewDesc("crane_node_memory_capacity",
93+
"The memory capacity requested by pod",
94+
[]string{"node"},
95+
nil,
96+
)
97+
nodeCPUReservedDesc = prometheus.NewDesc("crane_node_cpu_reserved",
98+
"The reserved cpu of node",
99+
[]string{"node"},
100+
nil)
101+
nodeCPUUsageOnlineDesc = prometheus.NewDesc("crane_node_cpu_usage_online",
102+
"The online cpu usage of node",
103+
[]string{"node"},
104+
nil)
105+
nodeCPUUsageOfflineDesc = prometheus.NewDesc("crane_node_cpu_usage_offline",
106+
"The offline cpu usage of node",
107+
[]string{"node"},
108+
nil)
109+
nodeMemoryReservedDesc = prometheus.NewDesc("crane_node_memory_reserved",
110+
"The reserved memory of node",
111+
[]string{"node"},
112+
nil)
113+
nodeMemoryUsageOnlineDesc = prometheus.NewDesc("crane_node_memory_usage_online",
114+
"The online memory usage of node",
115+
[]string{"node"},
116+
nil)
117+
nodeMemoryUsageOfflineDesc = prometheus.NewDesc("crane_node_memory_usage_offline",
118+
"The offline memory usage of node",
119+
[]string{"node"},
120+
nil)
121+
)
122+
123+
type NodeResourceCollector struct {
124+
nodeName string
125+
nodeLister v1.NodeLister
126+
nodeResourceGetter func() *known.ResourceStatus
127+
}
128+
129+
func NewNodeResourceCollector(nodeName string, nodeLister v1.NodeLister, nodeResourceGetter func() *known.ResourceStatus) *NodeResourceCollector {
130+
return &NodeResourceCollector{
131+
nodeName: nodeName,
132+
nodeLister: nodeLister,
133+
nodeResourceGetter: nodeResourceGetter,
134+
}
135+
}
136+
137+
func (n *NodeResourceCollector) Describe(descs chan<- *prometheus.Desc) {
138+
// resource metrics from status of node
139+
descs <- nodeElasticCPUDesc
140+
descs <- nodeElasticMemoryDesc
141+
descs <- nodeCPUAllocatableDesc
142+
descs <- nodeCPUCapacityDesc
143+
descs <- nodeMemoryAllocatableDesc
144+
descs <- nodeMemoryCapacityDesc
145+
146+
// usage metrics
147+
descs <- nodeCPUReservedDesc
148+
descs <- nodeCPUUsageOnlineDesc
149+
descs <- nodeCPUUsageOfflineDesc
150+
descs <- nodeMemoryReservedDesc
151+
descs <- nodeMemoryUsageOnlineDesc
152+
descs <- nodeMemoryUsageOfflineDesc
153+
}
154+
155+
func (n *NodeResourceCollector) Collect(metrics chan<- prometheus.Metric) {
156+
node, err := n.nodeLister.Get(n.nodeName)
157+
if err != nil {
158+
klog.ErrorS(err, "list pods failed")
159+
return
160+
}
161+
metrics <- prometheus.MustNewConstMetric(nodeElasticCPUDesc, prometheus.GaugeValue, node.Status.Allocatable.Name(known.ElasticCPU, resource.DecimalSI).AsApproximateFloat64(), node.Name)
162+
metrics <- prometheus.MustNewConstMetric(nodeElasticMemoryDesc, prometheus.GaugeValue, node.Status.Allocatable.Name(known.ElasticMemory, resource.BinarySI).AsApproximateFloat64(), node.Name)
163+
metrics <- prometheus.MustNewConstMetric(nodeCPUAllocatableDesc, prometheus.GaugeValue, node.Status.Allocatable.Cpu().AsApproximateFloat64(), node.Name)
164+
metrics <- prometheus.MustNewConstMetric(nodeMemoryAllocatableDesc, prometheus.GaugeValue, node.Status.Allocatable.Memory().AsApproximateFloat64(), node.Name)
165+
metrics <- prometheus.MustNewConstMetric(nodeCPUCapacityDesc, prometheus.GaugeValue, node.Status.Capacity.Cpu().AsApproximateFloat64(), node.Name)
166+
metrics <- prometheus.MustNewConstMetric(nodeMemoryCapacityDesc, prometheus.GaugeValue, node.Status.Capacity.Memory().AsApproximateFloat64(), node.Name)
167+
168+
resourceStatus := n.nodeResourceGetter()
169+
if resourceStatus == nil {
170+
return
171+
}
172+
metrics <- prometheus.MustNewConstMetric(nodeCPUReservedDesc, prometheus.GaugeValue, resourceStatus.CPUReserved.AsApproximateFloat64(), node.Name)
173+
// TODO incorrect online define !!
174+
metrics <- prometheus.MustNewConstMetric(nodeCPUUsageOnlineDesc, prometheus.GaugeValue, resourceStatus.CPUUsage.AsApproximateFloat64(), node.Name)
175+
metrics <- prometheus.MustNewConstMetric(nodeCPUUsageOfflineDesc, prometheus.GaugeValue, resourceStatus.CPUUsageOffline.AsApproximateFloat64(), node.Name)
176+
177+
metrics <- prometheus.MustNewConstMetric(nodeMemoryReservedDesc, prometheus.GaugeValue, resourceStatus.MemoryReserved.AsApproximateFloat64(), node.Name)
178+
// TODO incorrect online define !!
179+
metrics <- prometheus.MustNewConstMetric(nodeMemoryUsageOnlineDesc, prometheus.GaugeValue, resourceStatus.MemoryUsage.AsApproximateFloat64(), node.Name)
180+
metrics <- prometheus.MustNewConstMetric(nodeMemoryUsageOfflineDesc, prometheus.GaugeValue, resourceStatus.MemoryUsageOffline.AsApproximateFloat64(), node.Name)
181+
182+
}

0 commit comments

Comments
 (0)