diff --git a/.gitignore b/.gitignore deleted file mode 100644 index fdd9e5b..0000000 --- a/.gitignore +++ /dev/null @@ -1,161 +0,0 @@ -# Created by https://www.toptal.com/developers/gitignore/api/python -# Edit at https://www.toptal.com/developers/gitignore?templates=python - -### Python ### -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# Data -*.json -*.txt -*.lock - -# Folder -cache/ -klue_dir/ -prediction/ -wandb/ -best_model/ - -# Checkpoint -*.pt -*.bin -*.pth - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
import pickle
from typing import Dict, List, Optional, Text, Tuple

import numpy as np  # kept: imported by the original module (may be used elsewhere)
import pandas as pd


# Maps fine-grained NER tags to the coarse relation-extraction entity types
# used downstream; 'O' marks tags that are not usable as entities.
# Module-level so that augmentation() can see it (originally it was a local
# of main(), which made augmentation() raise NameError at runtime).
TAG_MAP: Dict[str, str] = {
    'PERSON': 'PER',
    'LOCATION': 'LOC',
    'ORGANIZATION': 'ORG',
    'CITY': 'LOC',
    'COUNTRY': 'ORG',  # ORG
    'ARTIFACT': 'O',
    'DATE': 'DAT',
    'TIME': 'DAT',
    'CIVILIZATION': 'O',
    'ANIMAL': 'O',
    'PLANT': 'O',
    'QUANTITY': 'NOH',
    'STUDY_FIELD': 'O',
    'THEORY': 'O',
    'EVENT': 'O',  # ORG
    'MATERIAL': 'O',
    'TERM': 'O',
    'OCCUPATION': 'O',  # occupation
    'DISEASE': 'O',
    'O': 'O',
}


def find_nth(string: Text, substring: Text, n: int) -> int:
    """Return the index of the n-th (1-based) occurrence of *substring*
    in *string*, or -1 if there are fewer than n occurrences.

    Iterative rather than recursive: the original recursed once per
    occurrence, so a large n could hit the recursion limit.
    """
    idx = string.find(substring)
    while idx != -1 and n > 1:
        idx = string.find(substring, idx + 1)
        n -= 1
    return idx


def entity_prepro(sentence: Text, entity: Tuple[str, str, int]) -> Dict:
    """Convert a (word, tag, occurrence-number) triple into the span dict
    expected downstream: {word, start_idx, end_idx, type}.

    NOTE(review): if the word is somehow absent, start_idx is -1; callers
    build the triples from the sentence itself, so this should not occur.
    """
    start_idx = find_nth(sentence, entity[0], entity[2])
    end_idx = start_idx + len(entity[0])

    return {
        "word": entity[0],
        "start_idx": start_idx,
        "end_idx": end_idx,
        "type": entity[1],
    }


def data_organizing(
    sentence: Text,
    subjects: Tuple[str, str, int],
    objects: Tuple[str, str, int],
) -> List:
    """Return [sentence, subject-span-dict, object-span-dict] for one pair."""
    p_subjects = entity_prepro(sentence, subjects)
    p_objects = entity_prepro(sentence, objects)

    return [sentence, p_subjects, p_objects]


def augmentation(
    tagged_sentences: List[List[Tuple[str, str]]],
    tag_map: Optional[Dict[str, str]] = None,
) -> List[List]:
    """Build relation-extraction rows from NER-tagged sentences.

    Each input sentence is a list of (token, tag) pairs. For every sentence,
    every (subject, object) pair of entity tokens — subjects restricted to
    PERSON/ORGANIZATION — yields one [sentence, subject, object] row.

    Args:
        tagged_sentences: NER-tagged sentences as lists of (token, tag).
        tag_map: tag -> entity-type mapping; defaults to module TAG_MAP.
            (New optional parameter: fixes the original NameError where
            tag_map only existed inside main().)

    Returns:
        List of [sentence, subject_entity_dict, object_entity_dict] rows.
    """
    if tag_map is None:
        tag_map = TAG_MAP

    # Annotate each token with how many times its text has appeared so far
    # in the sentence, so repeated words can be located unambiguously later.
    counted_sentences = []
    for sent in tagged_sentences:
        prefix = ''
        counted = []
        for tok, tag in sent:
            # substring count over the concatenated prefix, as in the original
            counted.append((tok, tag, prefix.count(tok) + 1))
            prefix += tok
        counted_sentences.append(counted)

    print("Number of data to augment :", len(counted_sentences))

    augmented_data = []
    for tag_sent in counted_sentences:
        org_sent = "".join(tok for tok, _, _ in tag_sent)
        # Object candidates: any token whose mapped tag is a real entity type.
        obj_list = [(tok, tag, cnt) for tok, tag, cnt in tag_sent if tag_map[tag] != 'O']
        # Subjects must additionally be a person or organization.
        sbj_list = [t for t in obj_list if t[1] in ('PERSON', 'ORGANIZATION')]
        cand_list = [[org_sent, sbj, obj] for sbj in sbj_list for obj in obj_list if sbj != obj]
        augmented_data.extend(data_organizing(s, sbj, obj) for s, sbj, obj in cand_list)

    print("Number of Augmented data :", len(augmented_data))

    return augmented_data


def main():
    """Load NER-tagged sentences, augment them, and write CSV/pickle outputs."""
    with open('tagged_sentence.pickle', 'rb') as f:
        tagged_sentence = pickle.load(f)

    aug_data = augmentation(tagged_sentence)

    augmented_data = pd.DataFrame(aug_data)
    augmented_data.columns = ['sentence', 'subject_entity', 'object_entity']
    augmented_data['label'] = None
    augmented_data['source'] = 'augmented'

    augmented_data.to_csv("augmented_data.csv", index=False)

    with open('augmented_data.pickle', 'wb') as f:
        pickle.dump(augmented_data, f, pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    # Guard added: the original defined main() but never called it.
    main()