Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 33 additions & 6 deletions sdgx/models/LLM/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pandas as pd

from sdgx.exceptions import SynthesizerInitError
from sdgx.models.base import SynthesizerModel
from sdgx.utils import logger
Expand Down Expand Up @@ -87,13 +89,38 @@ def _check_access_type(self):
raise SynthesizerInitError("Duplicate data access type found.")

def _form_columns_description(self):
    """
    Build a plain-text summary of ``self.raw_data`` with one line per column.

    We believe that giving information about a column helps improve data quality.

    Each column is described according to its dtype:

    - numeric (excluding boolean): dtype, min, max, mean and standard deviation;
    - datetime: earliest and latest value formatted as ``YYYY-MM-DD``, or a
      "no non-null values" note when the column holds nothing but nulls;
    - categorical dtype, boolean dtype, or fewer than 20 distinct non-null values:
      the number of distinct values and a preview of up to five of them;
    - anything else: the dtype only.

    Returns:
        str: The per-column lines joined with newlines. An empty string is
        returned when the table has no columns.
    """
    max_categories = 20  # columns with fewer distinct values are treated as categorical
    preview_size = 5  # number of distinct values listed for a categorical column

    def _looks_categorical(column):
        """Return True when ``column`` should be summarised as a set of categories."""
        column_dtype = column.dtype
        if isinstance(column_dtype, pd.CategoricalDtype):
            return True
        if pd.api.types.is_bool_dtype(column_dtype):
            return True
        try:
            return column.nunique() < max_categories
        except TypeError:
            # Unhashable cells (lists, dicts, ...) cannot be counted; let the
            # caller fall back to the dtype-only description.
            return False

    # NOTE(review): assumes self.raw_data is a pandas.DataFrame -- confirm
    # against the code that assigns it.
    df = self.raw_data
    desc_lines = []

    for col in df.columns:
        series = df[col]
        dtype = series.dtype

        # Booleans satisfy is_numeric_dtype, but mean/std say little about
        # them; they are described as categories further down instead.
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            line = (
                f'Column "{col}": type {dtype}, '
                f"min {series.min()}, max {series.max()}, "
                f"mean {series.mean():.2f}, std {series.std():.2f}."
            )
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            earliest = series.min()
            latest = series.max()
            if pd.isna(earliest):
                # min() is NaT for an empty or all-null column, and NaT does
                # not support strftime.
                line = f'Column "{col}": type datetime, no non-null values.'
            else:
                line = (
                    f'Column "{col}": type datetime, '
                    f'from {earliest.strftime("%Y-%m-%d")}, '
                    f'to {latest.strftime("%Y-%m-%d")}.'
                )
        elif _looks_categorical(series):
            # drop_duplicates keeps first-appearance order (nulls included),
            # and tolist() yields plain Python scalars for a clean preview.
            distinct = series.drop_duplicates()
            preview = distinct.head(preview_size).tolist()
            ellipsis = "..." if len(distinct) > preview_size else ""
            line = (
                f'Column "{col}": type category, '
                f"{len(distinct)} categories: {preview}{ellipsis}."
            )
        else:
            line = f'Column "{col}": type {dtype}.'

        desc_lines.append(line)

    return "\n".join(desc_lines)

def _form_message_with_offtable_features(self):
"""
Expand Down