diff --git a/procstat/.gitignore b/procstat/.gitignore
new file mode 100644
index 0000000000..c0be3de0b5
--- /dev/null
+++ b/procstat/.gitignore
@@ -0,0 +1,4 @@
+procstatenv/
+dist/
+__pycache__/
+build/
\ No newline at end of file
diff --git a/procstat/README.md b/procstat/README.md
new file mode 100644
index 0000000000..9457d119c4
--- /dev/null
+++ b/procstat/README.md
@@ -0,0 +1,141 @@
+# Proc stat sampler
+
+## Run tests
+
+The following command runs the unit tests for the proc stat sampling system:
+
+```bash
+python3 -m unittest discover -v tests/
+```
+
+## Using the sampler
+
+```
+./proc-stat-sampler.py --help
+usage: proc-stat-sampler.py [-h] [--sample-frequency SAMPLE_FREQUENCY]
+                            [--track-proc-name [TRACK_PROC_NAME [TRACK_PROC_NAME ...]]]
+                            [--dump-path DUMP_PATH]
+
+Proc stat sampler CLI
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --sample-frequency SAMPLE_FREQUENCY
+                        Number of samples to obtain per second. Defaults to 1
+                        per second.
+  --track-proc-name [TRACK_PROC_NAME [TRACK_PROC_NAME ...]]
+                        Process name(s) to track, if any. Multiple allowed.
+  --dump-path DUMP_PATH
+                        Path where the result will be written.
+```
+
+## Transform a dump from the sampler to yaml
+
+```
+./dump-to-yaml.py --help
+usage: dump-to-yaml.py [-h] [--dump-path DUMP_PATH]
+
+Transforms dumps from the sampler to yaml
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --dump-path DUMP_PATH
+                        Path where the target dump resides.
+```
+
+### Sample output
+
+```
+- cpu_percent: 2.4
+  cpu_times:
+    guest: 0.0
+    guest_nice: 0.0
+    idle: 8788185.5
+    iowait: 2188.63
+    irq: 0.0
+    nice: 19.19
+    softirq: 14.13
+    steal: 0.0
+    system: 765.38
+    user: 5233.24
+  processes: []
+  timestamp: 1581979800.9612823
+- cpu_percent: 0.0
+  cpu_times:
+    guest: 0.0
+    guest_nice: 0.0
+    idle: 8788225.51
+    iowait: 2188.63
+    irq: 0.0
+    nice: 19.19
+    softirq: 14.13
+    steal: 0.0
+    system: 765.38
+    user: 5233.25
+  processes: []
+  timestamp: 1581979801.9625692
+- cpu_percent: 0.0
+  cpu_times:
+    guest: 0.0
+    guest_nice: 0.0
+    idle: 8788265.53
+    iowait: 2188.63
+    irq: 0.0
+    nice: 19.19
+    softirq: 14.13
+    steal: 0.0
+    system: 765.38
+    user: 5233.25
+  processes: []
+  timestamp: 1581979802.963791
+```
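+
+A dump can also be consumed programmatically instead of going through yaml. A
+minimal sketch, run from this directory, using the same `Collector.read_dump`
+helper that `dump-to-yaml.py` uses (the dump path below is hypothetical):
+
+```python
+from modules.collector import Collector
+
+# No output file is needed; Collector is only used for its read_dump helper.
+collector = Collector(file=None)
+with open("/tmp/psd-example.pickle-dump", "rb") as f:
+    for snapshot in collector.read_dump(f):
+        # Each snapshot is a dict; see Sampler.get_snapshot for the keys.
+        print(snapshot["timestamp"], snapshot["cpu_percent"])
+```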
+
+## Expose the proc stat sampler output for prometheus scraping
+
+```bash
+# run in a separate terminal
+./prom.py --track-proc-name nginx envoy --http-port 8000
+```
+
+## Querying the statistics
+
+Note: the output below is from an early version, which only tracked internal
+metrics from the prometheus client library.
+
+```bash
+curl --silent 127.0.0.1:8000 | head
+
+# HELP python_gc_objects_collected_total Objects collected during gc
+# TYPE python_gc_objects_collected_total counter
+python_gc_objects_collected_total{generation="0"} 123.0
+python_gc_objects_collected_total{generation="1"} 255.0
+python_gc_objects_collected_total{generation="2"} 0.0
+# HELP python_gc_objects_uncollectable_total Uncollectable object found during GC
+# TYPE python_gc_objects_uncollectable_total counter
+python_gc_objects_uncollectable_total{generation="0"} 0.0
+python_gc_objects_uncollectable_total{generation="1"} 0.0
+python_gc_objects_uncollectable_total{generation="2"} 0.0
+```
+
+## Exposing prometheus metrics in sidecar proxy containers
+
+The following script builds a standalone binary, deploys it to the benchmark
+sidecar proxy containers, and fires up the service.
+
+```bash
+NAMESPACE=twopods-istio ./install_to_container.sh
+```
+
+## Testing if the service is running in containers
+
+The service listens on port 8000 by default, so querying that port with curl
+should output a set of counters in prometheus format.
+
+```bash
+kubectl --namespace twopods-istio exec fortioclient-6b58bf5799-hkq8l -c istio-proxy -- curl 127.0.0.1:8000
+
+...
+cpu_times_system 6217.48
+...
+```
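+
+The endpoint can also be scraped from Python; a minimal sketch (the port and
+metric name match the defaults above):
+
+```python
+from urllib.request import urlopen
+
+# Fetch the plain-text prometheus exposition and pick out a single metric.
+body = urlopen("http://127.0.0.1:8000").read().decode("utf-8")
+for line in body.splitlines():
+    if line.startswith("cpu_times_system"):
+        print(line)
+```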
diff --git a/procstat/dump-to-yaml.py b/procstat/dump-to-yaml.py
new file mode 100755
index 0000000000..8f478e8d26
--- /dev/null
+++ b/procstat/dump-to-yaml.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+import argparse
+from modules.collector import Collector
+from modules.yaml_formatter import to_yaml
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Transforms dumps from the sampler to yaml')
+    parser.add_argument("--dump-path", type=str, help='Path where the target dump resides.')
+    args = parser.parse_args()
+    with open(args.dump_path, "rb") as file:
+        collector = Collector(file=None)
+        yaml = to_yaml(list(collector.read_dump(file)))
+        print(yaml)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/procstat/install_to_container.sh b/procstat/install_to_container.sh
new file mode 100755
index 0000000000..7465039222
--- /dev/null
+++ b/procstat/install_to_container.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -x
+set -e
+
+echo "Building standalone binary"
+
+export NAMESPACE=${NAMESPACE:-'twopods-istio'}
+
+if [ ! -d dist/ ]
+then
+    rm requirements.txt || true
+    python3 -m venv procstatenv
+    source procstatenv/bin/activate
+    pip3 install prometheus-client psutil
+    # We strip a line because of a bug in pip freeze
+    pip freeze | grep -v "pkg-resources" > requirements.txt
+    # We build in a docker container to make sure we produce a compatible
+    # binary (it needs to be built against a compatible glibc version).
+    # TODO(oschaaf): is it OK to use this docker image?
+    docker run -v "$(pwd):/src/" cdrx/pyinstaller-linux:python3 "pyinstaller prom.py"
+fi
+
+echo "Deploying standalone binary"
+
+kubectl get pods --namespace "${NAMESPACE}" --no-headers --field-selector=status.phase=Running -o name | while read -r pod
+do
+    # Strip the pod/ prefix that kubectl prepends
+    pod=${pod#"pod/"}
+    echo "Installing to ${pod}"
+    kubectl --namespace "${NAMESPACE}" exec "${pod}" -c istio-proxy -- rm -rf /etc/istio/proxy/procstat
+    kubectl --namespace "${NAMESPACE}" cp ./ "${pod}":/etc/istio/proxy/procstat -c istio-proxy
+    echo "Firing up service in ${pod}"
+    # Stop the existing service instance, if any
+    kubectl --namespace "${NAMESPACE}" exec "${pod}" -c istio-proxy -- pkill -f prom || true
+    # FIXME: this needs the kubectl command to stay running on the machine
+    # running this script
+    kubectl --namespace "${NAMESPACE}" exec "${pod}" -c istio-proxy -- /etc/istio/proxy/procstat/dist/prom/prom &
+done
+
+echo "proc stat sampling deployed"
diff --git a/procstat/modules/collector.py b/procstat/modules/collector.py
new file mode 100644
index 0000000000..7adfbae7a4
--- /dev/null
+++ b/procstat/modules/collector.py
@@ -0,0 +1,43 @@
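+"""Runs a Sampler on a background thread and pickles each snapshot to a file.
+
+Usage sketch (the file name is hypothetical):
+
+    with open("/tmp/example.dump", "wb") as file:
+        collector = Collector(file, sample_interval=0.5)
+        collector.start()
+        sleep(2)
+        collector.stop()  # joins the worker thread and closes the file
+"""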
") + global COLLECTOR + COLLECTOR.stop() + + +class Prom: + def __init__(self): + pass + + def run(self, arguments): + parser = ArgumentParser(description='Proc stat sampler CLI') + parser.add_argument("--track-proc-name", type=str, nargs="*", + help='Optional process name(s) to track.', default=[]) + parser.add_argument("--sample-frequency", type=int, default=1, + help='Number of samples to obtain per second.') + parser.add_argument("--http-port", type=int, default=8000, + help='Http port for exposing prometheus metrics.') + + args = parser.parse_args(arguments) + + signal(SIGINT, signal_handler) + signal(SIGTERM, signal_handler) + + global COLLECTOR + # We hand the write side of the pipe to our proc stat collector. + pipe_read_fd, pipe_write_fd = pipe() + COLLECTOR = Collector(fdopen(pipe_write_fd, "wb", 1024), sampler=Sampler( + process_names_of_interest=args.track_proc_name), sample_interval=1.0/args.sample_frequency) + # Start serving prometheus stats over http + start_http_server(args.http_port) + + # Start sampling proc stat. + COLLECTOR.start() + + cpu_times_guest = Gauge('cpu_times_guest', '') + cpu_times_guest_nice = Gauge('cpu_times_guest_nice', '') + cpu_times_idle = Gauge('cpu_times_idle', '') + cpu_times_iowait = Gauge('cpu_times_iowait', '') + cpu_times_irq = Gauge('cpu_times_irq', '') + cpu_times_nice = Gauge('cpu_times_nice', '') + cpu_times_softirq = Gauge('cpu_times_softirq', '') + cpu_times_steal = Gauge('cpu_times_steal', '') + cpu_times_system = Gauge('cpu_times_system', '') + cpu_times_user = Gauge('cpu_times_user', '') + + cpu_stats_ctx_switches = Gauge('cpu_stats_ctx_switches', '') + cpu_stats_interrupts = Gauge('cpu_stats_interrupts', '') + cpu_stats_soft_interrupts = Gauge('cpu_stats_soft_interrupts', '') + cpu_stats_syscalls = Gauge('cpu_stats_syscalls', '') + + # The collector will write proc stat samples to the file descriptor we handed it above. + # We will read those here, and update the prometheus stats according to these samples. + with fdopen(pipe_read_fd, "rb", 1024) as f: + it = COLLECTOR.read_dump(f) + # TODO(oschaaf): Add an option here to also stream the raw data to another fd, + # as we loose information in the summary we serve over http. This could be helpfull + # when in-depth analysis is desired of an observed problem. 
+        cpu_times_guest = Gauge('cpu_times_guest', '')
+        cpu_times_guest_nice = Gauge('cpu_times_guest_nice', '')
+        cpu_times_idle = Gauge('cpu_times_idle', '')
+        cpu_times_iowait = Gauge('cpu_times_iowait', '')
+        cpu_times_irq = Gauge('cpu_times_irq', '')
+        cpu_times_nice = Gauge('cpu_times_nice', '')
+        cpu_times_softirq = Gauge('cpu_times_softirq', '')
+        cpu_times_steal = Gauge('cpu_times_steal', '')
+        cpu_times_system = Gauge('cpu_times_system', '')
+        cpu_times_user = Gauge('cpu_times_user', '')
+
+        cpu_stats_ctx_switches = Gauge('cpu_stats_ctx_switches', '')
+        cpu_stats_interrupts = Gauge('cpu_stats_interrupts', '')
+        cpu_stats_soft_interrupts = Gauge('cpu_stats_soft_interrupts', '')
+        cpu_stats_syscalls = Gauge('cpu_stats_syscalls', '')
+
+        # The collector writes proc stat samples to the file descriptor we
+        # handed it above. We read those samples here, and update the
+        # prometheus stats accordingly.
+        with fdopen(pipe_read_fd, "rb", 1024) as f:
+            it = COLLECTOR.read_dump(f)
+            # TODO(oschaaf): Add an option here to also stream the raw data to
+            # another fd, as we lose information in the summary we serve over
+            # http. This could be helpful when in-depth analysis of an
+            # observed problem is desired.
+            for entry in it:
+                cpu_times_guest.set(entry["cpu_times"].guest)
+                cpu_times_guest_nice.set(entry["cpu_times"].guest_nice)
+                cpu_times_idle.set(entry["cpu_times"].idle)
+                cpu_times_iowait.set(entry["cpu_times"].iowait)
+                cpu_times_irq.set(entry["cpu_times"].irq)
+                cpu_times_nice.set(entry["cpu_times"].nice)
+                cpu_times_softirq.set(entry["cpu_times"].softirq)
+                cpu_times_steal.set(entry["cpu_times"].steal)
+                cpu_times_system.set(entry["cpu_times"].system)
+                cpu_times_user.set(entry["cpu_times"].user)
+
+                cpu_stats_ctx_switches.set(entry["cpu_stats"].ctx_switches)
+                cpu_stats_interrupts.set(entry["cpu_stats"].interrupts)
+                cpu_stats_soft_interrupts.set(entry["cpu_stats"].soft_interrupts)
+                cpu_stats_syscalls.set(entry["cpu_stats"].syscalls)
+        print("stopped")
diff --git a/procstat/modules/sampler.py b/procstat/modules/sampler.py
new file mode 100755
index 0000000000..5c97e23685
--- /dev/null
+++ b/procstat/modules/sampler.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+import psutil
+import time
+
+
+class Sampler:
+    def __init__(self, process_names_of_interest=None, global_cpu_percent=True,
+                 global_cpu_times=True, per_cpu_percent=False, per_cpu_times=False):
+        self.process_attrs_of_interest = [
+            "pid", "name", "cpu_times", "cpu_percent"]
+        self.global_cpu_percent = global_cpu_percent
+        self.global_cpu_times = global_cpu_times
+        self.per_cpu_percent = per_cpu_percent
+        self.per_cpu_times = per_cpu_times
+        self.processes_of_interest = self.get_processes_of_interest(
+            process_names_of_interest or [])
+
+    def get_processes_of_interest(self, process_names_of_interest):
+        processes_of_interest = []
+        for p in psutil.process_iter(attrs=self.process_attrs_of_interest):
+            if p.info["name"] in process_names_of_interest:
+                processes_of_interest.append(p)
+        return processes_of_interest
+
+    def get_snapshot(self):
+        # This should be fast: the measurement should interfere as little as
+        # possible with what we're trying to measure. Consider adding a
+        # benchmark test for this call to estimate the overhead we're adding.
+        o = {}
+        o["timestamp"] = time.time()
+        if self.global_cpu_percent:
+            o["cpu_percent"] = psutil.cpu_percent(
+                interval=0, percpu=False)
+        if self.global_cpu_times:
+            o["cpu_times"] = psutil.cpu_times(percpu=False)
+        if self.per_cpu_percent:
+            o["per_cpu_percent"] = psutil.cpu_percent(
+                interval=0, percpu=True)
+        if self.per_cpu_times:
+            o["per_cpu_times"] = psutil.cpu_times(percpu=True)
+        o["cpu_stats"] = psutil.cpu_stats()
+        o["processes"] = []
+        for process in self.processes_of_interest:
+            attrs = {}
+            o["processes"].append(attrs)
+            for attr in self.process_attrs_of_interest:
+                attrs[attr] = process.info[attr]
+
+        return o
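+
+
+if __name__ == "__main__":
+    # Demo sketch: the module is marked executable and has a shebang, so
+    # running it directly prints a single snapshot for a quick smoke test.
+    print(Sampler().get_snapshot())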
diff --git a/procstat/modules/yaml_formatter.py b/procstat/modules/yaml_formatter.py
new file mode 100644
index 0000000000..4c9abe0e3b
--- /dev/null
+++ b/procstat/modules/yaml_formatter.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+import yaml
+from enum import Enum
+
+# CDumper is backed by libyaml and is a little faster than the stock dumper;
+# fall back to the pure-python Dumper when the C extension is unavailable.
+try:
+    from yaml import CDumper as FastDumper
+except ImportError:
+    from yaml import Dumper as FastDumper
+
+# We massage yaml serialization a bit so that named tuples are output in a
+# readable way.
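+# For example (a sketch): for a namedtuple Point with fields x and y,
+# to_yaml([Point(x=1, y=2)]) renders the tuple as a mapping:
+#
+#   - x: 1
+#     y: 2
+#
+# rather than as a plain two-element sequence.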
+
+
+def to_yaml(o):
+    def setup_yaml_formatting():
+        def tuple_formatter(self, data):
+            if hasattr(data, '_asdict'):
+                return self.represent_dict(data._asdict())
+            return self.represent_list(data)
+
+        def enum_formatter(self, data):
+            return self.represent_data(repr(data))
+
+        yaml.add_multi_representer(tuple, tuple_formatter, Dumper=FastDumper)
+        yaml.add_multi_representer(Enum, enum_formatter, Dumper=FastDumper)
+
+    setup_yaml_formatting()
+    return yaml.dump(o, Dumper=FastDumper)
diff --git a/procstat/proc-stat-sampler.py b/procstat/proc-stat-sampler.py
new file mode 100755
index 0000000000..7bc411abda
--- /dev/null
+++ b/procstat/proc-stat-sampler.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import argparse
+from modules.collector import Collector
+from modules.sampler import Sampler
+import signal
+import tempfile
+import time
+
+STOPPED = False
+COLLECTOR = None
+
+
+def signal_handler(signum, frame):
+    print("stopping...")
+    global STOPPED
+    COLLECTOR.stop()
+    STOPPED = True
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Proc stat sampler CLI')
+    parser.add_argument("--sample-frequency", type=int, default=1,
+                        help='Number of samples to obtain per second. Defaults to 1 per second.')
+    parser.add_argument("--track-proc-name", type=str, nargs="*",
+                        help='Optional process name(s) to track.', default=[])
+    parser.add_argument("--dump-path", type=str, default=tempfile.mktemp(prefix="psd-", suffix=".pickle-dump"),
+                        help='Path where the result will be written.')
+    args = parser.parse_args()
+    print("proc stat sampler")
+    print("--sample-frequency: ", args.sample_frequency)
+    print("--track-proc-name: ", args.track_proc_name)
+    print("--dump-path: ", args.dump_path)
+    print("SIGINT/SIGTERM (ctrl+c) to stop")
+
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    sampler = Sampler(process_names_of_interest=args.track_proc_name)
+    global COLLECTOR
+    COLLECTOR = Collector(open(args.dump_path, "wb"), sampler=sampler)
+    COLLECTOR.start()
+    while not STOPPED:
+        time.sleep(1)
+    print("Stopped. Dump written to path '%s'" % args.dump_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/procstat/prom.py b/procstat/prom.py
new file mode 100755
index 0000000000..eefcb127cf
--- /dev/null
+++ b/procstat/prom.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+
+import sys
+from modules.prometheus_http import Prom
+
+if __name__ == '__main__':
+    p = Prom()
+    p.run(sys.argv[1:])
\ No newline at end of file
diff --git a/procstat/requirements.txt b/procstat/requirements.txt
new file mode 100644
index 0000000000..64ab8a6345
--- /dev/null
+++ b/procstat/requirements.txt
@@ -0,0 +1,2 @@
+prometheus-client==0.7.1
+psutil==5.7.0
diff --git a/procstat/tests/test_collector.py b/procstat/tests/test_collector.py
new file mode 100755
index 0000000000..6d265b7066
--- /dev/null
+++ b/procstat/tests/test_collector.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import unittest
+from time import sleep
+from modules.collector import Collector
+
+
+class TestCollector(unittest.TestCase):
+    def test_collector_dump(self):
+        RUN_SECONDS = 2
+        SAMPLE_FREQUENCY = 10
+        FILENAME = "/tmp/foobar"
+
+        with open(FILENAME, "wb") as file:
+            expected_snapshot_count = SAMPLE_FREQUENCY * RUN_SECONDS
+            collector = Collector(file, sample_interval=1/SAMPLE_FREQUENCY)
+            collector.start()
+            sleep(RUN_SECONDS)
+            collector.stop()
+
+        with open(FILENAME, "rb") as file:
+            snapshots = list(collector.read_dump(file))
+            # The count is timing-based, so allow it to come up slightly short.
+            self.assertAlmostEqual(expected_snapshot_count, len(snapshots), delta=2)
+            last_snapshot = snapshots[-1]
+            self.assertIn("cpu_percent", last_snapshot)
+            self.assertIn("cpu_times", last_snapshot)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/procstat/tests/test_prom.py b/procstat/tests/test_prom.py
new file mode 100755
index 0000000000..e8815569fb
--- /dev/null
+++ b/procstat/tests/test_prom.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+
+import unittest
+import os
+
+
+class TestProm(unittest.TestCase):
+    def test_prom_101(self):
+        self.assertEqual(0, os.system("tests/test_prom.sh"))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/procstat/tests/test_prom.sh b/procstat/tests/test_prom.sh
new file mode 100755
index 0000000000..5a1df7a48e
--- /dev/null
+++ b/procstat/tests/test_prom.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+function cleanup() {
+    kill -s TERM "${PID}"
+    wait "${PID}"
+}
+
+set -e
+set -x
+python3 prom.py --http-port 64634 &
+PID=$!
+trap cleanup EXIT
+sleep 2
+curl --silent 127.0.0.1:64634 | grep cpu_stats_interrupts
diff --git a/procstat/tests/test_sampler.py b/procstat/tests/test_sampler.py
new file mode 100755
index 0000000000..c949d19ab4
--- /dev/null
+++ b/procstat/tests/test_sampler.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+
+import unittest
+from modules.sampler import Sampler
+
+
+class TestSampler(unittest.TestCase):
+    def test_get_snapshot(self):
+        sampler = Sampler()
+        pd = sampler.get_snapshot()
+        expected_keys = ["cpu_percent", "cpu_times"]
+        for key in expected_keys:
+            self.assertIn(key, pd)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/procstat/tests/test_yaml_formatter.py b/procstat/tests/test_yaml_formatter.py
new file mode 100755
index 0000000000..fbb35bd1c9
--- /dev/null
+++ b/procstat/tests/test_yaml_formatter.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+import unittest
+from modules.yaml_formatter import to_yaml
+from modules.collector import Collector
+from time import sleep
+from modules.sampler import Sampler
+from multiprocessing import Process
+import psutil
+
+
+class TestYamlFormatter(unittest.TestCase):
+    def foo_process_run(self):
+        sleep(5)
+
+    def test_yaml_formatting(self):
+        FILENAME = "/tmp/foobar"
+        p = Process(target=self.foo_process_run)
+        p.start()
+        sampler = Sampler([psutil.Process(p.pid).name()])
+        with open(FILENAME, "wb") as file:
+            collector = Collector(file, sample_interval=0.5, sampler=sampler)
+            collector.start()
+            sleep(2)
+            collector.stop()
+        p.kill()
+        with open(FILENAME, "rb") as file:
+            yaml = to_yaml(list(collector.read_dump(file)))
+            self.assertIn("timestamp:", yaml)
+            self.assertIn("cpu_times:", yaml)
+            self.assertIn("cpu_percent:", yaml)
+            self.assertIn(" pid: %s" % p.pid, yaml)
+
+
+if __name__ == '__main__':
+    unittest.main()