diff --git a/.gitignore b/.gitignore
index 490e7c5..544a456 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,9 @@ MANIFEST
 data/patient_sequences.parquet
 data/CEHR-BERT_sample_patient_sequence.parquet
 
+# physionet.org data
+physionet.org/
+
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
diff --git a/data_import/load_ndjson_to_postgres.py b/data_import/load_ndjson_to_postgres.py
new file mode 100644
index 0000000..ba03c36
--- /dev/null
+++ b/data_import/load_ndjson_to_postgres.py
@@ -0,0 +1,100 @@
+"""Load the MIMIC-IV FHIR demo NDJSON exports into PostgreSQL tables."""
+
+import json
+import os
+from urllib.parse import quote
+
+import pandas as pd
+from sqlalchemy import create_engine
+
+
+# Connection settings come from the same environment variables (and defaults)
+# that odyssey/data/mimiciv/collect.py reads, so no credentials are committed.
+# User and password are percent-encoded because characters such as "@" or ":"
+# would otherwise corrupt the URL.
+db_user = quote(os.getenv("DB_USER", "postgres"), safe="")
+db_password = quote(os.getenv("DB_PASSWORD", "password"), safe="")
+db_host = os.getenv("DB_HOST", "localhost")
+db_name = os.getenv("DB_NAME", "mimiciv_fhir")
+engine = create_engine(
+    f"postgresql://{db_user}:{db_password}@{db_host}:5432/{db_name}"
+)
+
+# Directory holding the *.ndjson files. Override it with MIMIC_FHIR_DIR; the
+# default is the physionet.org/ download tree next to the data_import/ folder.
+data_dir = os.getenv(
+    "MIMIC_FHIR_DIR",
+    os.path.join(
+        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        "physionet.org",
+        "files",
+        "mimic-iv-fhir-demo",
+        "2.0",
+        "mimic-fhir",
+    ),
+)
+
+
+def flatten_data(data):
+    """Serialize nested values so every record is a flat mapping.
+
+    Each list or dict value is replaced, in place, by its JSON string.
+
+    Parameters
+    ----------
+    data : list of dict
+        Parsed NDJSON records; modified in place.
+
+    Returns
+    -------
+    list of dict
+        The same list object that was passed in.
+    """
+    for record in data:
+        for key, value in record.items():
+            if isinstance(value, (list, dict)):
+                record[key] = json.dumps(value)
+    return data
+
+
+def load_ndjson_to_db(file_path, table_name):
+    """Load one NDJSON file into a table, replacing any existing table.
+
+    Parameters
+    ----------
+    file_path : str
+        Path of the NDJSON file (one JSON object per line).
+    table_name : str
+        Name of the destination table.
+    """
+    with open(file_path, "r", encoding="utf-8") as f:
+        # Blank lines (e.g. a trailing empty line) are not valid JSON.
+        data = [json.loads(line) for line in f if line.strip()]
+    if not data:
+        # Avoid replacing an existing table with an empty, column-less one.
+        print(f"No records found in {file_path}; skipped {table_name}")
+        return
+    data = flatten_data(data)
+    df = pd.json_normalize(data)
+    df.to_sql(table_name, engine, if_exists="replace", index=False)
+    print(f"Loaded {file_path} into {table_name}")
+
+
+files_to_tables = {
+    "Condition.ndjson": "condition",
+    "Encounter.ndjson": "encounter",
+    "Medication.ndjson": "medication",
+    "MedicationAdministration.ndjson": "medication_administration",
+    "MedicationRequest.ndjson": "medication_request",
+    "ObservationChartevents.ndjson": "observation_chartevents",
+    "ObservationLabevents.ndjson": "observation_labevents",
+    "Patient.ndjson": "patient",
+    "Procedure.ndjson": "procedure",
+}
+
+for file_name, table_name in files_to_tables.items():
+    file_path = os.path.join(data_dir, file_name)
+    if os.path.exists(file_path):
+        load_ndjson_to_db(file_path, table_name)
+    else:
+        print(f"File {file_name} not found in {data_dir}")
diff --git a/odyssey/data/mimiciv/collect.py b/odyssey/data/mimiciv/collect.py
index ce5d6a2..f952e78 100644
--- a/odyssey/data/mimiciv/collect.py
+++ b/odyssey/data/mimiciv/collect.py
@@ -712,9 +712,9 @@ def group_conditions(self) -> None:
 
 if __name__ == "__main__":
     collector = FHIRDataCollector(
-        db_path="postgresql://postgres:pwd@localhost:5432/mimiciv-2.0",
+        db_path=f"postgresql://{os.getenv('DB_USER', 'postgres')}:{os.getenv('DB_PASSWORD', 'password')}@{os.getenv('DB_HOST', 'localhost')}:5432/{os.getenv('DB_NAME', 'mimiciv_fhir')}",
         schema="mimic_fhir",
-        save_dir="/mnt/data/odyssey/mimiciv_fhir1",
+        save_dir=os.getenv("SAVE_DIR", "~/default_path/data_files"),
         buffer_size=10000,
     )
     collector.get_patient_data()