Skip to content
This repository was archived by the owner on Oct 17, 2024. It is now read-only.
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 251 additions & 0 deletions manifests/metrics-collector-template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
---
# Namespace that holds every namespaced resource defined in this manifest.
apiVersion: v1
kind: Namespace
metadata:
  name: standalone-metrics-collector
---
# CA bundle used by the metrics collector to verify its scrape target.
# The Deployment in this manifest mounts this ConfigMap at
# /etc/serving-certs-ca-bundle and passes service-ca.crt as --from-ca-file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: metrics-collector-serving-certs-ca-bundle
  namespace: standalone-metrics-collector
  annotations:
    # Legacy annotation key, kept so behavior is unchanged on clusters that
    # only recognise the alpha form.
    service.alpha.openshift.io/inject-cabundle: "true"
    # Current annotation key documented by OpenShift for CA bundle injection.
    service.beta.openshift.io/inject-cabundle: "true"
data:
  # Intentionally empty here; per the OpenShift service CA documentation the
  # inject-cabundle annotation causes this key to be populated at runtime.
  service-ca.crt: ""
---
# Identity the metrics-collector pod runs as (referenced by the Deployment's
# serviceAccountName and by the ClusterRoleBinding subject in this manifest).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: endpoint-observability-operator-sa
  namespace: standalone-metrics-collector
---
# Binds the collector's service account to the cluster-monitoring-view
# ClusterRole cluster-wide.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: standalone-metrics-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-monitoring-view
subjects:
  - kind: ServiceAccount
    name: endpoint-observability-operator-sa
    namespace: standalone-metrics-collector
---
# TLS material for the collector. The Deployment in this manifest mounts this
# Secret at /tlscerts/ca and /tlscerts/certs.
# The ${...} placeholders must be substituted before the manifest is applied
# (the substitution mechanism is not shown in this file). Values under `data`
# must be base64-encoded, as required by the Kubernetes Secret API.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: observability-managed-cluster-certs
  namespace: standalone-metrics-collector
data:
  ca.crt: "${CA_CRT}"
  tls.crt: "${TLS_CRT}"
  tls.key: "${TLS_KEY}"
---
# Single-replica Deployment that runs telemeter-client as a standalone metrics
# collector. The ${...} placeholders (CLUSTER_NAME, CLUSTER_ID,
# OBSERVATORIUM_RECEIVER_URL, METRICS_COLLECTOR_IMAGE) must be substituted
# before the manifest is applied.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: standalone-metrics-collector
  namespace: standalone-metrics-collector
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      component: standalone-metrics-collector
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      # Must match spec.selector.matchLabels above.
      labels:
        component: standalone-metrics-collector
    spec:
      containers:
        - name: metrics-collector
          # Quoted so an empty or odd-looking substitution still yields a string.
          image: "${METRICS_COLLECTOR_IMAGE}"
          imagePullPolicy: Always
          # Exec-form command: no shell is involved. $(FROM) and $(TO) are
          # expanded by Kubernetes from the `env` entries below.
          command:
            - /usr/bin/telemeter-client
            - --from=$(FROM)
            - --to-upload=$(TO)
            # CA bundle and token used for the connection to $(FROM).
            - --from-ca-file=/etc/serving-certs-ca-bundle/service-ca.crt
            - --from-token-file=/var/run/secrets/kubernetes.io/serviceaccount/token
            - --interval=60s
            # NOTE(review): these are plain YAML scalars, so the double quotes
            # are passed to the process literally (there is no shell to strip
            # them). Confirm the collector tolerates the quotes in --label and
            # --rename values before changing them.
            - --label="cluster=${CLUSTER_NAME}"
            - --label="clusterID=${CLUSTER_ID}"
            # 1073741824 bytes = 1 GiB.
            - --limit-bytes=1073741824
            # One --match series selector per metric to collect.
            - --match={__name__=":node_memory_MemAvailable_bytes:sum"}
            - --match={__name__="ALERTS"}
            - --match={__name__="apiserver_request_count"}
            - --match={__name__="apiserver_request_latencies_summary_count"}
            - --match={__name__="apiserver_request_latencies_summary_sum"}
            - --match={__name__="authenticated_user_requests"}
            - --match={__name__="authentication_attempts"}
            - --match={__name__="cluster:capacity_cpu_cores:sum"}
            - --match={__name__="cluster:capacity_memory_bytes:sum"}
            - --match={__name__="cluster:container_cpu_usage:ratio"}
            - --match={__name__="cluster:container_spec_cpu_shares:ratio"}
            - --match={__name__="cluster:cpu_usage_cores:sum"}
            - --match={__name__="cluster:memory_usage:ratio"}
            - --match={__name__="cluster:memory_usage_bytes:sum"}
            - --match={__name__="cluster:usage:resources:sum"}
            - --match={__name__="cluster_infrastructure_provider"}
            - --match={__name__="cluster_version"}
            - --match={__name__="cluster_version_payload"}
            - --match={__name__="container_cpu_cfs_periods_total"}
            - --match={__name__="container_cpu_cfs_throttled_periods_total"}
            - --match={__name__="container_spec_cpu_quota"}
            - --match={__name__="coredns_dns_request_count_total"}
            - --match={__name__="coredns_dns_request_duration_seconds_sum"}
            - --match={__name__="coredns_dns_request_type_count_total"}
            - --match={__name__="coredns_dns_response_rcode_count_total"}
            - --match={__name__="etcd_debugging_mvcc_db_total_size_in_bytes"}
            - --match={__name__="etcd_debugging_snap_save_total_duration_seconds_sum"}
            - --match={__name__="etcd_disk_backend_commit_duration_seconds_bucket"}
            - --match={__name__="etcd_disk_backend_commit_duration_seconds_sum"}
            - --match={__name__="etcd_disk_wal_fsync_duration_seconds_bucket"}
            - --match={__name__="etcd_disk_wal_fsync_duration_seconds_sum"}
            - --match={__name__="etcd_object_counts"}
            - --match={__name__="etcd_network_client_grpc_received_bytes_total"}
            - --match={__name__="etcd_network_client_grpc_sent_bytes_total"}
            - --match={__name__="etcd_network_peer_received_bytes_total"}
            - --match={__name__="etcd_network_peer_sent_bytes_total"}
            - --match={__name__="etcd_server_client_requests_total"}
            - --match={__name__="etcd_server_has_leader"}
            - --match={__name__="etcd_server_health_failures"}
            - --match={__name__="etcd_server_leader_changes_seen_total"}
            - --match={__name__="etcd_server_proposals_failed_total"}
            - --match={__name__="etcd_server_proposals_pending"}
            - --match={__name__="etcd_server_proposals_committed_total"}
            - --match={__name__="etcd_server_proposals_applied_total"}
            - --match={__name__="etcd_server_quota_backend_bytes"}
            - --match={__name__="grpc_server_started_total"}
            - --match={__name__="haproxy_backend_connection_errors_total"}
            - --match={__name__="haproxy_backend_connections_total"}
            - --match={__name__="haproxy_backend_current_queue"}
            - --match={__name__="haproxy_backend_http_average_response_latency_milliseconds"}
            - --match={__name__="haproxy_backend_max_sessions"}
            - --match={__name__="haproxy_backend_response_errors_total"}
            - --match={__name__="haproxy_backend_up"}
            - --match={__name__="http_requests_total"}
            - --match={__name__="instance:node_filesystem_usage:sum"}
            - --match={__name__="instance:node_cpu_utilisation:rate1m"}
            - --match={__name__="instance:node_load1_per_cpu:ratio"}
            - --match={__name__="instance:node_memory_utilisation:ratio"}
            - --match={__name__="instance:node_network_receive_bytes_excluding_lo:rate1m"}
            - --match={__name__="instance:node_network_receive_drop_excluding_lo:rate1m"}
            - --match={__name__="instance:node_network_transmit_bytes_excluding_lo:rate1m"}
            - --match={__name__="instance:node_network_transmit_drop_excluding_lo:rate1m"}
            - --match={__name__="instance:node_num_cpu:sum"}
            - --match={__name__="instance:node_vmstat_pgmajfault:rate1m"}
            - --match={__name__="instance_device:node_disk_io_time_seconds:rate1m"}
            - --match={__name__="instance_device:node_disk_io_time_weighted_seconds:rate1m"}
            - --match={__name__="kube_daemonset_status_desired_number_scheduled"}
            - --match={__name__="kube_daemonset_status_number_unavailable"}
            - --match={__name__="kube_node_spec_unschedulable"}
            - --match={__name__="kube_node_status_allocatable"}
            - --match={__name__="kube_node_status_allocatable_cpu_cores"}
            - --match={__name__="kube_node_status_allocatable_memory_bytes"}
            - --match={__name__="kube_node_status_capacity"}
            - --match={__name__="kube_node_status_capacity_pods"}
            - --match={__name__="kube_node_status_capacity_cpu_cores"}
            - --match={__name__="kube_node_status_condition"}
            - --match={__name__="kube_pod_container_resource_limits"}
            - --match={__name__="kube_pod_container_resource_limits_cpu_cores"}
            - --match={__name__="kube_pod_container_resource_limits_memory_bytes"}
            - --match={__name__="kube_pod_container_resource_requests"}
            - --match={__name__="kube_pod_container_resource_requests_cpu_cores"}
            - --match={__name__="kube_pod_container_resource_requests_memory_bytes"}
            - --match={__name__="kube_pod_info"}
            - --match={__name__="kube_pod_owner"}
            - --match={__name__="kube_resourcequota"}
            - --match={__name__="kubelet_running_container_count"}
            - --match={__name__="kubelet_runtime_operations"}
            - --match={__name__="kubelet_runtime_operations_latency_microseconds"}
            - --match={__name__="kubelet_volume_stats_available_bytes"}
            - --match={__name__="kubelet_volume_stats_capacity_bytes"}
            - --match={__name__="kube_persistentvolume_status_phase"}
            - --match={__name__="machine_cpu_cores"}
            - --match={__name__="machine_memory_bytes"}
            - --match={__name__="mixin_pod_workload"}
            - --match={__name__="namespace:kube_pod_container_resource_requests_cpu_cores:sum"}
            - --match={__name__="namespace:kube_pod_container_resource_requests_memory_bytes:sum"}
            - --match={__name__="namespace:container_memory_usage_bytes:sum"}
            - --match={__name__="namespace_cpu:kube_pod_container_resource_requests:sum"}
            - --match={__name__="namespace_workload_pod:kube_pod_owner:relabel"}
            - --match={__name__="node_cpu_seconds_total"}
            - --match={__name__="node_filesystem_avail_bytes"}
            - --match={__name__="node_filesystem_free_bytes"}
            - --match={__name__="node_filesystem_size_bytes"}
            - --match={__name__="node_memory_MemAvailable_bytes"}
            - --match={__name__="node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate"}
            - --match={__name__="node_netstat_Tcp_OutSegs"}
            - --match={__name__="node_netstat_Tcp_RetransSegs"}
            - --match={__name__="node_netstat_TcpExt_TCPSynRetrans"}
            - --match={__name__="up"}
            - --match={__name__="cluster_monitoring_operator_reconcile_errors_total"}
            - --match={__name__="cluster_monitoring_operator_reconcile_attempts_total"}
            - --match={__name__="cluster_operator_conditions"}
            - --match={__name__="cluster_operator_up"}
            # Selectors restricted to job="apiserver".
            - --match={__name__="workqueue_queue_duration_seconds_bucket",job="apiserver"}
            - --match={__name__="workqueue_adds_total",job="apiserver"}
            - --match={__name__="workqueue_depth",job="apiserver"}
            - --match={__name__="go_goroutines",job="apiserver"}
            - --match={__name__="process_cpu_seconds_total",job="apiserver"}
            - --match={__name__="process_resident_memory_bytes",job="apiserver"}
            # Selectors that exclude series with an empty container label.
            - --match={__name__="container_memory_cache",container!=""}
            - --match={__name__="container_memory_rss",container!=""}
            - --match={__name__="container_memory_swap",container!=""}
            - --match={__name__="container_memory_working_set_bytes",container!=""}
            - --rename="mixin_pod_workload=namespace_workload_pod:kube_pod_owner:relabel"
            - --rename="namespace:kube_pod_container_resource_requests_cpu_cores:sum=namespace_cpu:kube_pod_container_resource_requests:sum"
            # Each --recordingrule value is a JSON object with "name" and
            # "query" keys. Backslashes are literal in plain YAML scalars, so
            # the \" sequences reach the process unchanged as JSON escapes.
            - --recordingrule={"name":"apiserver_request_duration_seconds:histogram_quantile_99","query":"histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", verb!=\"WATCH\"}[5m])) by (verb,le))"}
            - --recordingrule={"name":"apiserver_request_duration_seconds:histogram_quantile_99:instance","query":"histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", verb!=\"WATCH\"}[5m])) by (le, instance))"}
            - --recordingrule={"name":"sum:apiserver_request_total:1h","query":"sum(rate(apiserver_request_total{job=\"apiserver\"}[1h])) by(code, instance)"}
            - --recordingrule={"name":"sum:apiserver_request_total:5m","query":"sum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by(code, instance)"}
            - --recordingrule={"name":"rpc_rate:grpc_server_handled_total:sum_rate","query":"sum(rate(grpc_server_handled_total{job=\"etcd\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))"}
            - --recordingrule={"name":"active_streams_watch:grpc_server_handled_total:sum","query":"sum(grpc_server_started_total{job=\"etcd\",grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{job=\"etcd\",grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})"}
            - --recordingrule={"name":"active_streams_lease:grpc_server_handled_total:sum","query":"sum(grpc_server_started_total{job=\"etcd\",grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{job=\"etcd\",grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})"}
          env:
            # Substituted into --from=$(FROM) above.
            - name: FROM
              value: https://prometheus-k8s.openshift-monitoring.svc:9091
            # Substituted into --to-upload=$(TO) above.
            - name: TO
              value: "${OBSERVATORIUM_RECEIVER_URL}"
            # Env values must be strings, hence the quoted "true".
            - name: STANDALONE
              value: "true"
          resources: {}
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            # The same Secret volume is mounted at both TLS paths.
            - mountPath: /tlscerts/ca
              name: observability-managed-cluster-certs
            - mountPath: /tlscerts/certs
              name: observability-managed-cluster-certs
            - mountPath: /etc/serving-certs-ca-bundle
              name: serving-certs-ca-bundle
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      # Only serviceAccountName is set; the `serviceAccount` field is a
      # deprecated alias that the API server fills in from this value.
      serviceAccountName: endpoint-observability-operator-sa
      terminationGracePeriodSeconds: 30
      volumes:
        - name: observability-managed-cluster-certs
          secret:
            # 420 decimal == 0644 octal.
            defaultMode: 420
            secretName: observability-managed-cluster-certs
        - name: serving-certs-ca-bundle
          configMap:
            # 420 decimal == 0644 octal.
            defaultMode: 420
            name: metrics-collector-serving-certs-ca-bundle