stolostron · clyang82 · Jun 11, 2021 · Jun 16, 2021
diff --git a/manifests/metrics-collector-template.yaml b/manifests/metrics-collector-template.yaml
@@ -0,0 +1,251 @@
+---
+apiVersion: v1
+kind: Namespace  # namespace used by the namespaced objects in this template
+metadata:
+  name: standalone-metrics-collector
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: metrics-collector-serving-certs-ca-bundle
+  namespace: standalone-metrics-collector
+  annotations:
+    service.alpha.openshift.io/inject-cabundle: 'true'  # presumably filled by OpenShift's service CA; confirm
+data:
+  service-ca.crt: ''  # empty in the template; the Deployment reads this key via --from-ca-file
+---
+apiVersion: v1
+kind: ServiceAccount  # identity of the collector pod (serviceAccountName in the Deployment)
+metadata:
+  name: endpoint-observability-operator-sa
+  namespace: standalone-metrics-collector
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # grants cluster-monitoring-view to the collector's ServiceAccount
+metadata:
+  name: standalone-metrics-collector
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: cluster-monitoring-view
+subjects:
+- kind: ServiceAccount
+  name: endpoint-observability-operator-sa
+  namespace: standalone-metrics-collector
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: observability-managed-cluster-certs
+  namespace: standalone-metrics-collector
+type: Opaque
+# The ${...} placeholders are presumably substituted before apply - confirm.
+# Secret `data` values must be base64-encoded; keep each value on one line.
+# Quoting stops an empty or number-like substitution from being re-typed.
+data:
+  ca.crt: "${CA_CRT}"
+  tls.crt: "${TLS_CRT}"
+  tls.key: "${TLS_KEY}"
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: standalone-metrics-collector
+  namespace: standalone-metrics-collector
+spec:
+  progressDeadlineSeconds: 600
+  replicas: 1  # a single collector pod
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      component: standalone-metrics-collector
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      # Pod labels must keep matching spec.selector.matchLabels above.
+      labels:
+        component: standalone-metrics-collector
+    spec:
+      containers:
+      - command:  # argv of the collector; $(FROM) and $(TO) refer to this container's env entries
+        - /usr/bin/telemeter-client
+        - --from=$(FROM)
+        - --to-upload=$(TO)
+        - --from-ca-file=/etc/serving-certs-ca-bundle/service-ca.crt  # path under the serving-certs-ca-bundle mount
+        - --from-token-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+        - --interval=60s
+        - --label="cluster=${CLUSTER_NAME}"  # NOTE(review): the double quotes are part of the argument (no shell here) - confirm the binary expects them
+        - --label="clusterID=${CLUSTER_ID}"
+        - --limit-bytes=1073741824  # 1 GiB (2^30 bytes)
+        - --match={__name__=":node_memory_MemAvailable_bytes:sum"}  # one --match per selected metric name
+        - --match={__name__="ALERTS"}
+        - --match={__name__="apiserver_request_count"}
+        - --match={__name__="apiserver_request_latencies_summary_count"}
+        - --match={__name__="apiserver_request_latencies_summary_sum"}
+        - --match={__name__="authenticated_user_requests"}
+        - --match={__name__="authentication_attempts"}
+        - --match={__name__="cluster:capacity_cpu_cores:sum"}
+        - --match={__name__="cluster:capacity_memory_bytes:sum"}
+        - --match={__name__="cluster:container_cpu_usage:ratio"}
+        - --match={__name__="cluster:container_spec_cpu_shares:ratio"}
+        - --match={__name__="cluster:cpu_usage_cores:sum"}
+        - --match={__name__="cluster:memory_usage:ratio"}
+        - --match={__name__="cluster:memory_usage_bytes:sum"}
+        - --match={__name__="cluster:usage:resources:sum"}
+        - --match={__name__="cluster_infrastructure_provider"}
+        - --match={__name__="cluster_version"}
+        - --match={__name__="cluster_version_payload"}
+        - --match={__name__="container_cpu_cfs_periods_total"}
+        - --match={__name__="container_cpu_cfs_throttled_periods_total"}
+        - --match={__name__="container_spec_cpu_quota"}
+        - --match={__name__="coredns_dns_request_count_total"}
+        - --match={__name__="coredns_dns_request_duration_seconds_sum"}
+        - --match={__name__="coredns_dns_request_type_count_total"}
+        - --match={__name__="coredns_dns_response_rcode_count_total"}
+        - --match={__name__="etcd_debugging_mvcc_db_total_size_in_bytes"}
+        - --match={__name__="etcd_debugging_snap_save_total_duration_seconds_sum"}
+        - --match={__name__="etcd_disk_backend_commit_duration_seconds_bucket"}
+        - --match={__name__="etcd_disk_backend_commit_duration_seconds_sum"}
+        - --match={__name__="etcd_disk_wal_fsync_duration_seconds_bucket"}
+        - --match={__name__="etcd_disk_wal_fsync_duration_seconds_sum"}
+        - --match={__name__="etcd_object_counts"}
+        - --match={__name__="etcd_network_client_grpc_received_bytes_total"}
+        - --match={__name__="etcd_network_client_grpc_sent_bytes_total"}
+        - --match={__name__="etcd_network_peer_received_bytes_total"}
+        - --match={__name__="etcd_network_peer_sent_bytes_total"}
+        - --match={__name__="etcd_server_client_requests_total"}
+        - --match={__name__="etcd_server_has_leader"}
+        - --match={__name__="etcd_server_health_failures"}
+        - --match={__name__="etcd_server_leader_changes_seen_total"}
+        - --match={__name__="etcd_server_proposals_failed_total"}
+        - --match={__name__="etcd_server_proposals_pending"}
+        - --match={__name__="etcd_server_proposals_committed_total"}
+        - --match={__name__="etcd_server_proposals_applied_total"}
+        - --match={__name__="etcd_server_quota_backend_bytes"}
+        - --match={__name__="grpc_server_started_total"}
+        - --match={__name__="haproxy_backend_connection_errors_total"}
+        - --match={__name__="haproxy_backend_connections_total"}
+        - --match={__name__="haproxy_backend_current_queue"}
+        - --match={__name__="haproxy_backend_http_average_response_latency_milliseconds"}
+        - --match={__name__="haproxy_backend_max_sessions"}
+        - --match={__name__="haproxy_backend_response_errors_total"}
+        - --match={__name__="haproxy_backend_up"}
+        - --match={__name__="http_requests_total"}
+        - --match={__name__="instance:node_filesystem_usage:sum"}
+        - --match={__name__="instance:node_cpu_utilisation:rate1m"}
+        - --match={__name__="instance:node_load1_per_cpu:ratio"}
+        - --match={__name__="instance:node_memory_utilisation:ratio"}
+        - --match={__name__="instance:node_network_receive_bytes_excluding_lo:rate1m"}
+        - --match={__name__="instance:node_network_receive_drop_excluding_lo:rate1m"}
+        - --match={__name__="instance:node_network_transmit_bytes_excluding_lo:rate1m"}
+        - --match={__name__="instance:node_network_transmit_drop_excluding_lo:rate1m"}
+        - --match={__name__="instance:node_num_cpu:sum"}
+        - --match={__name__="instance:node_vmstat_pgmajfault:rate1m"}
+        - --match={__name__="instance_device:node_disk_io_time_seconds:rate1m"}
+        - --match={__name__="instance_device:node_disk_io_time_weighted_seconds:rate1m"}
+        - --match={__name__="kube_daemonset_status_desired_number_scheduled"}
+        - --match={__name__="kube_daemonset_status_number_unavailable"}
+        - --match={__name__="kube_node_spec_unschedulable"}
+        - --match={__name__="kube_node_status_allocatable"}
+        - --match={__name__="kube_node_status_allocatable_cpu_cores"}
+        - --match={__name__="kube_node_status_allocatable_memory_bytes"}
+        - --match={__name__="kube_node_status_capacity"}
+        - --match={__name__="kube_node_status_capacity_pods"}
+        - --match={__name__="kube_node_status_capacity_cpu_cores"}
+        - --match={__name__="kube_node_status_condition"}
+        - --match={__name__="kube_pod_container_resource_limits"}
+        - --match={__name__="kube_pod_container_resource_limits_cpu_cores"}
+        - --match={__name__="kube_pod_container_resource_limits_memory_bytes"}
+        - --match={__name__="kube_pod_container_resource_requests"}
+        - --match={__name__="kube_pod_container_resource_requests_cpu_cores"}
+        - --match={__name__="kube_pod_container_resource_requests_memory_bytes"}
+        - --match={__name__="kube_pod_info"}
+        - --match={__name__="kube_pod_owner"}
+        - --match={__name__="kube_resourcequota"}
+        - --match={__name__="kubelet_running_container_count"}
+        - --match={__name__="kubelet_runtime_operations"}
+        - --match={__name__="kubelet_runtime_operations_latency_microseconds"}
+        - --match={__name__="kubelet_volume_stats_available_bytes"}
+        - --match={__name__="kubelet_volume_stats_capacity_bytes"}
+        - --match={__name__="kube_persistentvolume_status_phase"}
+        - --match={__name__="machine_cpu_cores"}
+        - --match={__name__="machine_memory_bytes"}
+        - --match={__name__="mixin_pod_workload"}
+        - --match={__name__="namespace:kube_pod_container_resource_requests_cpu_cores:sum"}
+        - --match={__name__="namespace:kube_pod_container_resource_requests_memory_bytes:sum"}
+        - --match={__name__="namespace:container_memory_usage_bytes:sum"}
+        - --match={__name__="namespace_cpu:kube_pod_container_resource_requests:sum"}
+        - --match={__name__="namespace_workload_pod:kube_pod_owner:relabel"}
+        - --match={__name__="node_cpu_seconds_total"}
+        - --match={__name__="node_filesystem_avail_bytes"}
+        - --match={__name__="node_filesystem_free_bytes"}
+        - --match={__name__="node_filesystem_size_bytes"}
+        - --match={__name__="node_memory_MemAvailable_bytes"}
+        - --match={__name__="node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate"}
+        - --match={__name__="node_netstat_Tcp_OutSegs"}
+        - --match={__name__="node_netstat_Tcp_RetransSegs"}
+        - --match={__name__="node_netstat_TcpExt_TCPSynRetrans"}
+        - --match={__name__="up"}
+        - --match={__name__="cluster_monitoring_operator_reconcile_errors_total"}
+        - --match={__name__="cluster_monitoring_operator_reconcile_attempts_total"}
+        - --match={__name__="cluster_operator_conditions"}
+        - --match={__name__="cluster_operator_up"}
+        - --match={__name__="workqueue_queue_duration_seconds_bucket",job="apiserver"}
+        - --match={__name__="workqueue_adds_total",job="apiserver"}
+        - --match={__name__="workqueue_depth",job="apiserver"}
+        - --match={__name__="go_goroutines",job="apiserver"}
+        - --match={__name__="process_cpu_seconds_total",job="apiserver"}
+        - --match={__name__="process_resident_memory_bytes",job="apiserver"}
+        - --match={__name__="container_memory_cache",container!=""}
+        - --match={__name__="container_memory_rss",container!=""}
+        - --match={__name__="container_memory_swap",container!=""}
+        - --match={__name__="container_memory_working_set_bytes",container!=""}
+        - --rename="mixin_pod_workload=namespace_workload_pod:kube_pod_owner:relabel"  # presumably old=new metric name; verify against the collector
+        - --rename="namespace:kube_pod_container_resource_requests_cpu_cores:sum=namespace_cpu:kube_pod_container_resource_requests:sum"
+        - --recordingrule={"name":"apiserver_request_duration_seconds:histogram_quantile_99","query":"histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", verb!=\"WATCH\"}[5m])) by (verb,le))"}
+        - --recordingrule={"name":"apiserver_request_duration_seconds:histogram_quantile_99:instance","query":"histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", verb!=\"WATCH\"}[5m])) by (le, instance))"}
+        - --recordingrule={"name":"sum:apiserver_request_total:1h","query":"sum(rate(apiserver_request_total{job=\"apiserver\"}[1h])) by(code, instance)"}
+        - --recordingrule={"name":"sum:apiserver_request_total:5m","query":"sum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by(code, instance)"}
+        - --recordingrule={"name":"rpc_rate:grpc_server_handled_total:sum_rate","query":"sum(rate(grpc_server_handled_total{job=\"etcd\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))"}
+        - --recordingrule={"name":"active_streams_watch:grpc_server_handled_total:sum","query":"sum(grpc_server_started_total{job=\"etcd\",grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{job=\"etcd\",grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})"}
+        - --recordingrule={"name":"active_streams_lease:grpc_server_handled_total:sum","query":"sum(grpc_server_started_total{job=\"etcd\",grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{job=\"etcd\",grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})"}
+        name: metrics-collector
+        image: ${METRICS_COLLECTOR_IMAGE}
+        imagePullPolicy: Always
+        env:
+        - name: FROM  # referenced as $(FROM) in the command list
+          value: https://prometheus-k8s.openshift-monitoring.svc:9091
+        - name: TO  # referenced as $(TO) in the command list
+          value: ${OBSERVATORIUM_RECEIVER_URL}
+        - name: STANDALONE
+          value: 'true'
+        volumeMounts:
+        - name: observability-managed-cluster-certs  # same Secret volume mounted at two paths
+          mountPath: /tlscerts/ca
+        - name: observability-managed-cluster-certs
+          mountPath: /tlscerts/certs
+        - name: serving-certs-ca-bundle
+          mountPath: /etc/serving-certs-ca-bundle
+        resources: {}
+        terminationMessagePath: /dev/termination-log
+        terminationMessagePolicy: File
+      dnsPolicy: ClusterFirst
+      restartPolicy: Always
+      schedulerName: default-scheduler
+      securityContext: {}
+      # Only serviceAccountName is set; `serviceAccount` is its deprecated alias.
+      serviceAccountName: endpoint-observability-operator-sa
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - name: observability-managed-cluster-certs
+        secret:
+          defaultMode: 420  # decimal form of octal 0644
+          secretName: observability-managed-cluster-certs
+      - name: serving-certs-ca-bundle
+        configMap:
+          defaultMode: 420  # decimal form of octal 0644
+          name: metrics-collector-serving-certs-ca-bundle