|
import warnings
from typing import List, Optional, Tuple, Union

import toolz  # type: ignore[import-untyped]
from attrs import define
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.utils import digest128
| 8 | + |
@define
class KafkaDecodingOptions:
    """
    Options that control decoding of the Kafka event.

    Attributes:
        key_type: Optional decoder hint for the message key.
        value_type: Optional decoder hint for the message value.
        format: Output layout; one of ``standard_v1`` (default),
            ``standard_v2``, or ``flexible``.
        include: For the ``flexible`` format, the payload field names
            to keep in the output.
        select: For the ``flexible`` format, a single payload field
            name to drill down into.
    """

    key_type: Optional[str] = None
    value_type: Optional[str] = None
    format: Optional[str] = None
    # NOTE: holds a *list* of field names (parsed from a comma-separated
    # parameter), consumed by `KafkaEvent.to_dict`.
    include: Optional[List[str]] = None
    select: Optional[str] = None

    def __attrs_post_init__(self):
        # Default to the legacy layout to avoid breaking existing pipelines.
        if self.format is None:
            self.format = "standard_v1"

    @classmethod
    def from_params(
        cls,
        key_type: List[str],
        value_type: List[str],
        format: List[str],
        include: List[str],
        select: List[str],
    ):
        """
        Build options from URL-style query parameters, where each parameter
        arrives as a (possibly empty) list of raw string values.

        Supplying ``include`` or ``select`` forces the ``flexible`` format,
        overriding any explicit ``format`` value.
        """
        output_format = format[0] if format else "standard_v1"
        include_fields = None
        select_field = None
        if include:
            output_format = "flexible"
            # BUG FIX: `toolz.apply(str.strip, <list>)` called
            # `str.strip(<list>)` and raised a TypeError; strip each
            # comma-separated field name individually instead.
            include_fields = [name.strip() for name in include[0].split(",")]
        if select:
            output_format = "flexible"
            select_field = select[0]
        return cls(
            key_type=key_type[0] if key_type else None,
            value_type=value_type[0] if value_type else None,
            format=output_format,
            include=include_fields,
            select=select_field,
        )
| 54 | + |
| 55 | + |
@define
class KafkaEvent:
    """
    Manage details of a typical Kafka event/message/record.

    https://kafka.apache.org/intro#intro_concepts_and_terms
    """

    # (timestamp type, timestamp in milliseconds) as delivered by the
    # Kafka client — assumed epoch milliseconds; TODO confirm against caller.
    ts: Tuple[int, int]
    topic: str
    partition: int
    offset: int
    key: Union[bytes, str]
    value: Union[bytes, str]

    def decode_text(self):
        """Decode ``key`` and ``value`` from UTF-8 bytes to ``str``, in place."""
        # `isinstance(x, bytes)` is False for None, so no separate None check
        # is needed.
        if isinstance(self.key, bytes):
            self.key = self.key.decode("utf-8")
        if isinstance(self.value, bytes):
            self.value = self.value.decode("utf-8")

    def to_dict(self, options: KafkaDecodingOptions):
        """
        Serialize the event into a dictionary shaped by ``options.format``.

        Args:
            options: Controls the output layout and optional projections.

        Returns:
            A dict for the ``standard_v1``/``standard_v2`` layouts; for the
            ``flexible`` layout, a filtered dict (``include``), a single
            field value (``select``), or the full payload.

        Raises:
            NotImplementedError: If ``options.format`` is unknown.
        """
        # TODO: Make decoding from text optional.
        self.decode_text()

        # NOTE(review): the id ignores `offset`, so messages sharing
        # topic/partition/key get the same id — confirm this is intended.
        message_id = digest128(self.topic + str(self.partition) + str(self.key))

        # The standard message layout as defined per dlt and ingestr.
        standard_payload = {
            "partition": self.partition,
            "topic": self.topic,
            "key": self.key,
            "offset": self.offset,
            "ts": {
                "type": self.ts[0],
                # Kafka timestamps are in milliseconds; pendulum expects seconds.
                "value": ensure_pendulum_datetime(self.ts[1] / 1e3),
            },
            "data": self.value,
        }

        # Basic Kafka message processors providing two formats.
        # Returns the message value and metadata.
        # The legacy format `standard_v1` uses the field `_kafka_msg_id`,
        # while the future `standard_v2` format uses `_kafka__msg_id`,
        # better aligned with all the other fields.
        #
        # Currently, as of July 2025, `standard_v1` is used as the
        # default to not cause any breaking changes.

        if options.format == "standard_v1":
            # BUG FIX: the UserWarning was instantiated but never emitted;
            # actually surface the deprecation notice to the user.
            warnings.warn(
                "Future versions of ingestr will use the `standard_v2` output format. "
                "To retain compatibility, make sure to start using `format=standard_v1` early.",
                UserWarning,
                stacklevel=2,
            )
            return {
                "_kafka": standard_payload,
                "_kafka_msg_id": message_id,
            }

        if options.format == "standard_v2":
            standard_payload["msg_id"] = message_id
            return {
                "_kafka": standard_payload,
            }

        # Slightly advanced Kafka message processor providing basic means of projections.
        # include: A list of column names to include.
        # select: A single column name to select and drill down into.
        if options.format == "flexible":
            # TODO: Refactor by caching preparation steps.
            if options.include:
                # Callers refer to the payload's "data" field as "value".
                include_keys = [
                    "data" if key == "value" else key for key in options.include
                ]
                return toolz.keyfilter(lambda k: k in include_keys, standard_payload)
            if options.select:
                # TODO: Instead of a simple dictionary getter, `jsonpointer` or `jqlang`
                # can provide easy access to deeper levels of nested data structures.
                key = options.select.replace("value", "data")
                return standard_payload.get(key)
            return standard_payload

        raise NotImplementedError(f"Unknown message processor format: {options.format}")
0 commit comments