diff --git a/examples/label_example/README.md b/examples/label_example/README.md new file mode 100644 index 00000000..dbcb9bee --- /dev/null +++ b/examples/label_example/README.md @@ -0,0 +1,89 @@ +## NER Label Example + +This example shows how to start a search engine in Streamlit and link the search results to Stave. + +## Install extra dependencies + +To install from PyPI, +```bash +pip install forte.elastic +pip install forte.health +pip install stave +pip install streamlit +``` + +## Download spaCy model + +Run the following command to download the model: +```bash +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz +``` + +## Set up the configuration +Before running Elastic Searcher and Stave, we need to ensure that the current configuration is compatible with the environment of our computer. + +Please check and change the following configurations in `stave_config.yml`: + +1. Ensure `Stave.stave_db_path` points to Stave's SQLite database file under `$HOME/.stave`, e.g., `"/home/name/.stave/db.sqlite3"`. +2. Ensure `Stave.username` and `Stave.pw` are `"admin"` and `"admin"`. + +## Prepare elastic searcher +Download the corresponding Elasticsearch archive from https://www.elastic.co/downloads/past-releases/elasticsearch-7-17-2, unzip it and run `elasticsearch-7-17-2/bin/elasticsearch` to start the service. + +Run the following to check if Elasticsearch is running properly: +```bash +curl -XGET localhost:9200/_cluster/health?pretty +``` + +Make sure you create the index 'elastic_indexer' in the cluster before working with this example; you can do so with the following command: +```bash +curl -X PUT localhost:9200/elastic_indexer +``` + +You can also follow the online documentation for more information: + +https://www.elastic.co/guide/en/elasticsearch/reference/current/starting-elasticsearch.html + +## Run pipeline +First, you should start an Elastic Indexer backend. + +Now, open a terminal. You can run the following command to parse some files and index them. 
+```bash +python clinical_pipeline.py path_to_mimiciii/1.4/NOTEEVENTS.csv.gz path_to_mimiciii_output 10 1 +``` + +Here, we write out the raw data packs to `path_to_mimiciii_output`, and only index the first 10 notes. You can change the number to whatever you want in the above command. The last argument (`1`) selects the MIMIC-III discharge-note reader; any other value falls back to the plain-text reader. + +Also, we write the data into Elasticsearch. You can run the following command to check whether the 10 notes were written into your index: + +```bash +curl -X GET localhost:9200/elastic_indexer/_search +``` + +## Run indexer and Stave +Again, you should start an Elastic Indexer backend. + +Then, to start the Stave server that our pipeline will connect to for visualization purposes, run +```bash +stave -s start -o -l -n 8899 +``` +Then, log in with username (admin) and password (admin). + +Here, you need to make sure `Stave.url` in `stave_config.yml` is `"http://localhost:8899"`. You can change the port 8899 to any port you like, as long as the `stave` command and `Stave.url` use the same one. + +## Run streamlit + +To run Streamlit, the Python version should be >= 3.7.2. + +Now, open the terminal. Run the following command to start Streamlit: +```bash +streamlit run search_engine.py +``` + +Now open `http://localhost:8501` in your browser to access the Streamlit interface. + +Next, you will see the reports shown on the interface. You can also search with the search engine. + +Click a report link to open that report in Stave, the visualization and annotation page. + +Select the radio buttons (Disease and Chemical) on the sidebar to see the annotations on the UI. 
diff --git a/examples/label_example/clinical_pipeline.py b/examples/label_example/clinical_pipeline.py new file mode 100644 index 00000000..24df20de --- /dev/null +++ b/examples/label_example/clinical_pipeline.py @@ -0,0 +1,61 @@ +import sys +import time + +from forte.data.data_pack import DataPack +from forte.data.readers import PlainTextReader +from forte.pipeline import Pipeline +from forte.processors.writers import PackIdJsonPackWriter +from fortex.health.readers import Mimic3DischargeNoteReader +from fortex.elastic import ElasticSearchPackIndexProcessor +from fortex.health.processors.ner_label_processor import NERLabelProcessor + + +def main( + input_path: str, + output_path: str, + max_packs: int = -1, + use_mimiciii_reader=1 + ): + + pl = Pipeline[DataPack]() + if use_mimiciii_reader == 1: + pl.set_reader( + Mimic3DischargeNoteReader(), + config={"max_num_notes": max_packs} + ) + else: + pl.set_reader(PlainTextReader()) + + config_for_ner = { + "labels": ["disease", "chemical"], + "lang": "en_ner_bc5cdr_md" + } + pl.add(NERLabelProcessor(), config=config_for_ner) + + pl.add( + ElasticSearchPackIndexProcessor(), + { + "indexer": { + "other_kwargs": {"refresh": True}, + } + }, + ) + pl.add( + PackIdJsonPackWriter(), + { + "output_dir": output_path, + "indent": 2, + "overwrite": True, + "drop_record": True, + "zip_pack": False, + }, + ) + + pl.initialize() + + for idx, pack in enumerate(pl.process_dataset(input_path)): + if (idx + 1) % 50 == 0: + print(f"{time.strftime('%m-%d %H:%M')}: Processed {idx + 1} packs") + + +main(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4])) diff --git a/examples/label_example/default_onto_project.json b/examples/label_example/default_onto_project.json new file mode 100644 index 00000000..901ce4f1 --- /dev/null +++ b/examples/label_example/default_onto_project.json @@ -0,0 +1,751 @@ +{ + "name": "clinical_pipeline_base", + "ontology": { + "name": "base_ontology", + "definitions": [ + { + "entry_name": 
"ft.onto.base_ontology.Token", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation :class:`Token`, used to represent a token or a word.", + "attributes": [ + { + "name": "pos", + "type": "str" + }, + { + "name": "ud_xpos", + "type": "str", + "description": "Language specific pos tag. Used in CoNLL-U Format. Refer to https://universaldependencies.org/format.html" + }, + { + "name": "lemma", + "type": "str", + "description": "Lemma or stem of word form." + }, + { + "name": "chunk", + "type": "str" + }, + { + "name": "ner", + "type": "str" + }, + { + "name": "sense", + "type": "str" + }, + { + "name": "is_root", + "type": "bool" + }, + { + "name": "ud_features", + "type": "Dict", + "key_type": "str", + "value_type": "str" + }, + { + "name": "ud_misc", + "type": "Dict", + "key_type": "str", + "value_type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Subword", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "Used to represent subword tokenization results.", + "attributes": [ + { + "name": "is_first_segment", + "type": "bool" + }, + { + "name": "is_unk", + "type": "bool" + }, + { + "name": "vocab_id", + "type": "int" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Classification", + "parent_entry": "forte.data.ontology.top.Generics", + "description": "Used to store values for classification prediction", + "attributes": [ + { + "name": "classification_result", + "type": "Dict", + "key_type": "str", + "value_type": "float" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Document", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Document`, normally used to represent a document.", + "attributes": [ + { + "name": "document_class", + "type": "List", + "item_type": "str", + "description": "A list of class names that the document belongs to." 
+ }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classifications", + "type": "Dict", + "key_type": "str", + "value_type": "ft.onto.base_ontology.Classification", + "description": "Stores the classification results for this document. The key is the name/task of the classification, the value is an classification object storing the results." + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Sentence", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Sentence`, normally used to represent a sentence.", + "attributes": [ + { + "name": "speaker", + "type": "str" + }, + { + "name": "part_id", + "type": "int" + }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classification", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classifications", + "type": "Dict", + "key_type": "str", + "value_type": "ft.onto.base_ontology.Classification", + "description": "Stores the classification results for this sentence. The key is the name/task of the classification, the value is an classification object storing the results." + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Phrase", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Phrase`.", + "attributes": [ + { + "name": "phrase_type", + "type": "str" + }, + { + "name": "headword", + "type": "ft.onto.base_ontology.Token" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.UtteranceContext", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "`UtteranceContext` represents the context part in dialogue." 
+ }, + { + "entry_name": "ft.onto.base_ontology.Utterance", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Utterance`, normally used to represent an utterance in dialogue.", + "attributes": [ + { + "name": "speaker", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateArgument", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `PredicateArgument`, normally used to represent an argument of a predicate, can be linked to the predicate via the predicate link.", + "attributes": [ + { + "name": "ner_type", + "type": "str" + }, + { + "name": "predicate_lemma", + "type": "str" + }, + { + "name": "is_verb", + "type": "bool" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.EntityMention", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `EntityMention`, normally used to represent an Entity Mention in a piece of text.", + "attributes": [ + { + "name": "ner_type", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.EventMention", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `EventMention`, used to refer to a mention of an event.", + "attributes": [ + { + "name": "event_type", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateMention", + "parent_entry": "ft.onto.base_ontology.Phrase", + "description": "A span based annotation `PredicateMention`, normally used to represent a predicate (normally verbs) in a piece of text.", + "attributes": [ + { + "name": "predicate_lemma", + "type": "str" + }, + { + "name": "framenet_id", + "type": "str" + }, + { + "name": "is_verb", + "type": "bool" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateLink", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a semantic role link between a 
predicate and its argument.", + "attributes": [ + { + "name": "arg_type", + "type": "str", + "description": "The predicate link type." + } + ], + "parent_type": "ft.onto.base_ontology.PredicateMention", + "child_type": "ft.onto.base_ontology.PredicateArgument" + }, + { + "entry_name": "ft.onto.base_ontology.Dependency", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a syntactic dependency.", + "attributes": [ + { + "name": "dep_label", + "type": "str", + "description": "The dependency label." + }, + { + "name": "rel_type", + "type": "str" + } + ], + "parent_type": "ft.onto.base_ontology.Token", + "child_type": "ft.onto.base_ontology.Token" + }, + { + "entry_name": "ft.onto.base_ontology.EnhancedDependency", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a enhanced dependency: \n https://universaldependencies.org/u/overview/enhanced-syntax.html", + "attributes": [ + { + "name": "dep_label", + "type": "str", + "description": "The enhanced dependency label in Universal Dependency." + } + ], + "parent_type": "ft.onto.base_ontology.Token", + "child_type": "ft.onto.base_ontology.Token" + }, + { + "entry_name": "ft.onto.base_ontology.RelationLink", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a relation between two entity mentions", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EntityMention", + "child_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.CrossDocEntityRelation", + "parent_entry": "forte.data.ontology.top.MultiPackLink", + "description": "A `Link` type entry which represent a relation between two entity mentions across the packs.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." 
+ } + ], + "parent_type": "ft.onto.base_ontology.EntityMention", + "child_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.CoreferenceGroup", + "parent_entry": "forte.data.ontology.top.Group", + "description": "A group type entry that take `EntityMention`, as members, used to represent coreferent group of entities.", + "member_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.EventRelation", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a relation between two event mentions.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EventMention", + "child_type": "ft.onto.base_ontology.EventMention" + }, + { + "entry_name": "ft.onto.base_ontology.CrossDocEventRelation", + "parent_entry": "forte.data.ontology.top.MultiPackLink", + "description": "A `Link` type entry which represent a relation between two event mentions across the packs.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EventMention", + "child_type": "ft.onto.base_ontology.EventMention" + }, + { + "entry_name": "ft.onto.base_ontology.ConstituentNode", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `ConstituentNode` to represent constituents in constituency parsing. 
This can also sentiment values annotated on the nodes.", + "attributes": [ + { + "name": "label", + "type": "str" + }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "is_root", + "type": "bool" + }, + { + "name": "is_leaf", + "type": "bool" + }, + { + "name": "parent_node", + "type": "ft.onto.base_ontology.ConstituentNode" + }, + { + "name": "children_nodes", + "type": "List", + "item_type": "ft.onto.base_ontology.ConstituentNode" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Title", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Title`, normally used to represent a title." + }, + { + "entry_name": "ft.onto.base_ontology.Body", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Body`, normally used to represent a document body." + }, + { + "entry_name": "ft.onto.base_ontology.MCOption", + "parent_entry": "forte.data.ontology.top.Annotation" + }, + { + "entry_name": "ft.onto.base_ontology.MCQuestion", + "parent_entry": "forte.data.ontology.top.Annotation", + "attributes": [ + { + "name": "options", + "type": "List", + "item_type": "ft.onto.base_ontology.MCOption" + }, + { + "name": "answers", + "type": "List", + "item_type": "int" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.MRCQuestion", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An `Annotation` type which represents an MRC question.", + "attributes": [ + { + "name": "qid", + "type": "int" + }, + { + "name": "answers", + "type": "List", + "item_type": "ft.onto.base_ontology.Phrase" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Recording", + "parent_entry": "forte.data.ontology.top.AudioAnnotation", + "description": "A span based annotation `Recording`, normally used to represent a recording.", + "attributes": [ + { + "name": "recording_class", + "type": "List", + "item_type": "str", + 
"description": "A list of class names that the recording belongs to." + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.AudioUtterance", + "parent_entry": "forte.data.ontology.top.AudioAnnotation", + "description": "A span based annotation `AudioUtterance`, normally used to represent an utterance in dialogue.", + "attributes": [ + { + "name": "speaker", + "type": "str" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.NegationContext", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `NegationContext`, used to represent the negation context of a named entity.", + "attributes": [ + { + "name": "polarity", + "type": "bool" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalEntityMention", + "parent_entry": "ft.onto.base_ontology.EntityMention", + "description": "A span based annotation class MedicalEntityMention, used to represent an Entity Mention in medical domain", + "attributes": [ + { + "name": "umls_link", + "type": "str" + }, + { + "name": "umls_entities", + "type": "List", + "item_type": "ftx.medical.clinical_ontology.UMLSConceptLink" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalArticle", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An annotation based representation for the whole medical text chunk/document", + "attributes": [ + { + "name": "icd_version", + "type": "int", + "description": "The version of ICD-Coding being used." + }, + { + "name": "icd_code", + "type": "str", + "description": "The ICD code assigned to current medical article." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.Disease", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Disease`, used to represent the diseases in a piece of clinical text." 
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.Chemical", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Chemical`, used to represent the chemical in a piece of clinical text." + } + ] + }, + "config": { + "legendConfigs": { + "ft.onto.base_ontology.Token": { + "is_selected": false, + "is_shown": true, + "attributes": { + "pos": false, + "ud_xpos": false, + "lemma": false, + "chunk": false, + "ner": false, + "sense": false + } + }, + "ft.onto.base_ontology.Subword": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Classification": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.Document": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Sentence": { + "is_selected": false, + "is_shown": true, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.Phrase": { + "is_selected": false, + "is_shown": true, + "attributes": { + "phrase_type": false + } + }, + "ft.onto.base_ontology.UtteranceContext": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.Utterance": { + "is_selected": false, + "is_shown": false, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.PredicateArgument": { + "is_selected": false, + "is_shown": false, + "attributes": { + "ner_type": false, + "predicate_lemma": false + } + }, + "ft.onto.base_ontology.EntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "ner_type": false + } + }, + "ft.onto.base_ontology.EventMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "event_type": false + } + }, + "ft.onto.base_ontology.PredicateMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "predicate_lemma": false, + "framenet_id": false + } + }, + "ft.onto.base_ontology.PredicateLink": { + "is_selected": false, + "is_shown": false, + 
"attributes": { + "arg_type": false + } + }, + "ft.onto.base_ontology.Dependency": { + "is_selected": false, + "is_shown": false, + "attributes": { + "dep_label": false, + "rel_type": false + } + }, + "ft.onto.base_ontology.EnhancedDependency": { + "is_selected": false, + "is_shown": false, + "attributes": { + "dep_label": false + } + }, + "ft.onto.base_ontology.RelationLink": { + "is_selected": false, + "is_shown": true, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEntityRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CoreferenceGroup": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.EventRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEventRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.ConstituentNode": { + "is_selected": false, + "is_shown": false, + "attributes": { + "label": false + } + }, + "ft.onto.base_ontology.Title": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.Body": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.MCOption": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.MCQuestion": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.MRCQuestion": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.Recording": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.AudioUtterance": { + "is_selected": false, + "is_shown": false, + "attributes": { + "speaker": false + } + }, + "ftx.medical.clinical_ontology.NegationContext": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + 
"ftx.medical.clinical_ontology.MedicalEntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "umls_link": false + } + }, + "ftx.medical.clinical_ontology.MedicalArticle": { + "is_selected": false, + "is_shown": true, + "attributes": { + "icd_code": false + } + }, + "ftx.medical.clinical_ontology.Disease": { + "is_selected": false, + "is_shown": true + }, + "ftx.medical.clinical_ontology.Chemical": { + "is_selected": false, + "is_shown": true + } + }, + "scopeConfigs": { + "ft.onto.base_ontology.Token": false, + "ft.onto.base_ontology.Subword": false, + "ft.onto.base_ontology.Document": false, + "ft.onto.base_ontology.Sentence": false, + "ft.onto.base_ontology.Phrase": false, + "ft.onto.base_ontology.UtteranceContext": false, + "ft.onto.base_ontology.Utterance": false, + "ft.onto.base_ontology.PredicateArgument": false, + "ft.onto.base_ontology.EntityMention": false, + "ft.onto.base_ontology.EventMention": false, + "ft.onto.base_ontology.PredicateMention": false, + "ft.onto.base_ontology.ConstituentNode": false, + "ft.onto.base_ontology.Title": false, + "ft.onto.base_ontology.Body": false, + "ft.onto.base_ontology.MCOption": false, + "ft.onto.base_ontology.MCQuestion": false, + "ft.onto.base_ontology.MRCQuestion": false, + "ftx.medical.clinical_ontology.NegationContext": false, + "ftx.medical.clinical_ontology.MedicalEntityMention": false, + "ftx.medical.clinical_ontology.MedicalArticle": false, + "ftx.medical.clinical_ontology.Disease": false, + "ftx.medical.clinical_ontology.Chemical": false + }, + "layoutConfigs": { + "center-middle": "default-nlp", + "left": "default-meta", + "right": "default-attribute", + "center-bottom": "disable" + }, + "remoteConfigs": { + "pipelineUrl": "", + "doValidation": false, + "expectedName": "", + "inputFormat": "string", + "expectedRecords": {} + } + } +} \ No newline at end of file diff --git a/examples/label_example/demo/__init__.py b/examples/label_example/demo/__init__.py new file mode 100644 index 
00000000..49ecbbf8 --- /dev/null +++ b/examples/label_example/demo/__init__.py @@ -0,0 +1 @@ +# ***automatically_generated*** diff --git a/examples/label_example/demo/clinical.py b/examples/label_example/demo/clinical.py new file mode 100644 index 00000000..68541b46 --- /dev/null +++ b/examples/label_example/demo/clinical.py @@ -0,0 +1,49 @@ +# ***automatically_generated*** +# ***source json:examples/clinical_pipeline/clinical_onto.json*** +# flake8: noqa +# mypy: ignore-errors +# pylint: skip-file +""" +Automatically generated ontology clinical. Do not change manually. +""" + +from dataclasses import dataclass +from forte.data.data_pack import DataPack +from forte.data.ontology.top import Annotation +from ft.onto.base_ontology import EntityMention + +__all__ = [ + "ClinicalEntityMention", + "Description", + "Body", +] + + +@dataclass +class ClinicalEntityMention(EntityMention): + """ + A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Description(Annotation): + """ + A span based annotation `Description`, used to represent the description in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Body(Annotation): + """ + A span based annotation `Body`, used to represent the actual content in a piece of clinical note. 
+ """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) diff --git a/examples/label_example/search_engine.py b/examples/label_example/search_engine.py new file mode 100644 index 00000000..4c9bc98f --- /dev/null +++ b/examples/label_example/search_engine.py @@ -0,0 +1,93 @@ +import sqlite3 +from typing import List +import streamlit as st +from forte.common.configuration import Config +import yaml +from elasticsearch import Elasticsearch +from search_utils import all_search, index_search +from sqlite_utils import create_links, sqlite_insert, get_json, update_stave_db +import templates + + +def main(): + st.set_page_config(page_title="ForteHealth_Search_Engine", layout="wide") + + es = Elasticsearch(hosts=["http://localhost:9200/"]) + INDEX = "elastic_indexer" + + config = yaml.safe_load(open("stave_config.yml", "r")) + config = Config(config, default_hparams=None) + + default_project_json = get_json("default_onto_project.json") + + base_project_id = update_stave_db(default_project_json, config) + + st.title("Search the MIMIC III Data...") + search = st.text_input("Enter search words:") + + if not search: + records = {} + results = all_search(es, INDEX) + hits = results["hits"]["hits"] + + conn = sqlite3.connect(config.Stave.stave_db_path) + answers = [] + for idx, hit in enumerate(hits): + source = hit["_source"] + # The raw pack string and pack id (not database id) + raw_pack_str: str = source["pack_info"] + pack_id: str = source["doc_id"] + + # Now you can write the pack into the database and generate url. 
+ item = { + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": base_project_id, + } + + db_id = sqlite_insert(conn, "stave_backend_document", item) + answers += [db_id] + print(pack_id, db_id) + + links: List[str] = create_links(config.Stave.url, answers) + + for link in links: + st.write(link, unsafe_allow_html=True) + + if search: + results = index_search(es, INDEX, search) + hits = results["hits"]["hits"] + + conn = sqlite3.connect(config.Stave.stave_db_path) + answers = [] + docs = [] + for idx, hit in enumerate(hits): + source = hit["_source"] + # The raw pack string and pack id (not database id) + raw_pack_str: str = source["pack_info"] + pack_id: str = source["doc_id"] + highlight = "...".join(hit["highlight"]["content"]) + # Now you can write the pack into the database and generate url. + item = { + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": base_project_id, + } + + db_id = sqlite_insert(conn, "stave_backend_document", item) + answers += [db_id] + + docs.append(highlight) + + links: List[str] = create_links(config.Stave.url, answers) + + for i, _ in enumerate(links): + st.write(links[i], unsafe_allow_html=True) + st.write( + templates.search_result(docs[i].replace("\n", " ")), + unsafe_allow_html=True, + ) + + +if __name__ == '__main__': + main() diff --git a/examples/label_example/search_utils.py b/examples/label_example/search_utils.py new file mode 100644 index 00000000..35f1e6c4 --- /dev/null +++ b/examples/label_example/search_utils.py @@ -0,0 +1,55 @@ +''' +this file defines search functions for searching data in elasticsearch. +''' + + +def all_search(es, index: str) -> dict: + """ + Args: + es: Elasticsearch client instance. + index: Name of the index we are going to use. + size: Number of results returned in each search. 
+ """ + # search query + body = {"query": {"match_all": {}}} + + res = es.search(index=index, body=body) + + return res + + +def index_search(es, index: str, keywords: str) -> dict: + """ + Args: + es: Elasticsearch client instance. + index: Name of the index we are going to use. + keywords: Search keywords. + from_i: Start index of the results for pagination. + size: Number of results returned in each search. + """ + # search query + body = { + "query": { + "bool": { + "must": [ + { + "query_string": { + "query": keywords, + "fields": ["content"], + "default_operator": "AND", + } + } + ], + } + }, + "highlight": { + "pre_tags": [' '], + "post_tags": [""], + "fields": {"content": {}}, + }, + "aggs": {"match_count": {"value_count": {"field": "_id"}}}, + } + + res = es.search(index=index, body=body) + + return res diff --git a/examples/label_example/sqlite_utils.py b/examples/label_example/sqlite_utils.py new file mode 100644 index 00000000..38d61dce --- /dev/null +++ b/examples/label_example/sqlite_utils.py @@ -0,0 +1,87 @@ +""" +this file defines sqlite3 related utils for inserting data to +the database of stave. 
+""" +import json +from typing import List +import sqlite3 +import yaml +from stave_backend.lib.stave_session import StaveSession +from forte.common import Config + + +def sqlite_insert(conn, table, row): + """ + Args: + conn: connection + table: table name + row: inserted item + """ + cols: str = ", ".join('"{}"'.format(col) for col in row.keys()) + vals: str = ", ".join(":{}".format(col) for col in row.keys()) + sql: str = f'INSERT INTO "{table}" ({cols}) VALUES ({vals})' + cursor = conn.cursor() + cursor.execute(sql, row) + conn.commit() + return cursor.lastrowid + + +def create_links(url_stub: str, ids: List[int]) -> List[str]: + """ + Args: + url_stub: url of stave + ids: the doc ids of the reports + """ + links: List[str] = [] + + url_stub: str = url_stub.strip("/") + for temp_idm in ids: + links.append( + f"Report #{temp_idm}" + ) + return links + + +def get_json(path: str): + """ + Args: + path: the file path of the json file + """ + file_obj = open(path) + data = json.load(file_obj) + file_obj.close() + return data + + +def update_stave_db(default_project_json, config): + """ + Args: + default_project_json: the ontology configuration file + config: the configuration of Stave, including url, name, password, etc. 
+ """ + project_id_base = 0 + with StaveSession(url=config.Stave.url) as session: + session.login(username=config.Stave.username, password=config.Stave.pw) + + projects = session.get_project_list().json() + project_names = [project["name"] for project in projects] + + if default_project_json["name"] in project_names: + + base_project = [ + proj + for proj in projects + if proj["name"] == default_project_json["name"] + ][0] + return base_project["id"] + + resp1 = session.create_project(json.dumps(default_project_json)) + project_id_base = json.loads(resp1.text)["id"] + + config = yaml.safe_load(open("stave_config.yml", "r")) + config = Config(config, default_hparams=None) + con = sqlite3.connect(config.Stave.stave_db_path) + + con.commit() + + return project_id_base diff --git a/examples/label_example/stave_config.yml b/examples/label_example/stave_config.yml new file mode 100644 index 00000000..1b5e0c7f --- /dev/null +++ b/examples/label_example/stave_config.yml @@ -0,0 +1,5 @@ +Stave: + stave_db_path: "$HOME//.stave//db.sqlite3" + url: "http://localhost:8899" + username: admin + pw: admin diff --git a/examples/label_example/templates.py b/examples/label_example/templates.py new file mode 100644 index 00000000..5278a33a --- /dev/null +++ b/examples/label_example/templates.py @@ -0,0 +1,21 @@ +""" +This file defines some HTML templates +""" + + +def number_of_results(total_hits: int, duration: float) -> str: + """HTML scripts to display number of results and duration.""" + return f""" +
+ {total_hits} results ({duration:.2f} seconds) +

+ """ + + +def search_result(highlights: str) -> str: + """HTML scripts to display search results.""" + return f""" +
+ {highlights} +
+ """ diff --git a/fortex/health/ontology_specs/clinical_ontology.json b/fortex/health/ontology_specs/clinical_ontology.json index a9269abd..ed8df678 100644 --- a/fortex/health/ontology_specs/clinical_ontology.json +++ b/fortex/health/ontology_specs/clinical_ontology.json @@ -1,407 +1,407 @@ -{ - "name": "clinical_ontology", - "imports": [ - "base_ontology.json" - ], - "additional_prefixes": [ - "ftx.medical.clinical_ontology" - ], - "definitions": [ - { - "entry_name": "ftx.medical.clinical_ontology.ClinicalEntityMention", - "parent_entry": "ft.onto.base_ontology.EntityMention", - "description": "A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text." - }, - { - "entry_name": "ftx.medical.clinical_ontology.Description", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "A span based annotation `Description`, used to represent the description in a piece of clinical note." - }, - { - "entry_name": "ftx.medical.clinical_ontology.Body", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "A span based annotation `Body`, used to represent the actual content in a piece of clinical note." - }, - { - "entry_name": "ftx.medical.clinical_ontology.FrequencyAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The frequency determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.DurationAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The duration determination for the Drug NER profile." 
- }, - { - "entry_name": "ftx.medical.clinical_ontology.RouteAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The route determination for the Drug NER profile.", - "attributes": [ - { - "name": "in_take_method", - "type": "str" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.SuffixStrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The suffix portion of dosage strength determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.FractionStrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The fraction portion of dosages strength determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.RangeStrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The range portion of dosages stength determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.DecimalStrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The decimal portion of dosages stength determination for the Drug NER profile" - }, - { - "entry_name": "ftx.medical.clinical_ontology.DrugChangeStatusAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The change status of dosages determination for the Drug NER profile.", - "attributes": [ - { - "name": "change_status", - "type": "str", - "description": "Indicates the drug change status of 'stop', 'start', 'increase', 'decrease', or 'noChange'." - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.DosagesAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The dosage determination for the Drug NER profile." 
- }, - { - "entry_name": "ftx.medical.clinical_ontology.StrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "Holds the value representing the unit of the drug dosage." - }, - { - "entry_name": "ftx.medical.clinical_ontology.StrengthUnitAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "" - }, - { - "entry_name": "ftx.medical.clinical_ontology.FrequencyUnitAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The value represents the unit portion of the drug frequency.", - "attributes": [ - { - "name": "period", - "type": "float", - "description": "The periodic unit used, e.g day, month, hour, etc." - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.FormAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The value represents the form portion of the drug mention." - }, - { - "entry_name": "ftx.medical.clinical_ontology.SubSectionAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "", - "attributes": [ - { - "name": "sub_ssection_body_begin", - "type": "int", - "description": "Sub-section body begin offset." - }, - { - "name": "sub_section_body_end", - "type": "int", - "description": "Sub-section body end offset." - }, - { - "name": "status", - "type": "int", - "description": "Status of 'possible', 'history of', or 'family history of'." - }, - { - "name": "sub_section_header_begin", - "type": "int", - "description": "Begin offset of subSection header" - }, - { - "name": "sub_section_header_end", - "type": "int", - "description": "Ending offset of subsection header" - }, - { - "name": "parent_section_id", - "type": "str", - "description": "The section in which the subsection was found." 
- } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.DrugMentionAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "", - "attributes": [ - { - "name": "status", - "type": "int", - "description": "" - }, - { - "name": "confidence", - "type": "float", - "description": "The confidence of the annotation." - }, - { - "name": "frequency", - "type": "str", - "description": "Frequency refers to how often the patient needs to take the drug. Frequency is divided into frequency number and frequency unit. E.g. twice daily" - }, - { - "name": "frequency_begin", - "type": "int", - "description": "" - }, - { - "name": "frequency_end", - "type": "int", - "description": "" - }, - { - "name": "duration", - "type": "str", - "description": "Duration refers to for how long the patient is expected to take the drug. E.g. 'for 2 weeks' Strongly encouraged to use bold text" - }, - { - "name": "duration_begin", - "type": "int", - "description": "" - }, - { - "name": "duration_end", - "type": "int", - "description": "" - }, - { - "name": "route", - "type": "str", - "description": "Medication route refers to the way that a drug is introduced into the body. E.g oral Strongly encouraged to use bold text" - }, - { - "name": "route_begin", - "type": "int", - "description": "" - }, - { - "name": "route_end", - "type": "int", - "description": "" - }, - { - "name": "drug_change_status", - "type": "str", - "description": "Status refers to the whether the medication is currently being taken or not." - }, - { - "name": "dosage", - "type": "str", - "description": "Dosage refers to how many of each drug the patient is taking. E.g. 
5 mg" - }, - { - "name": "dosage_begin", - "type": "int", - "description": "" - }, - { - "name": "dosage_end", - "type": "int", - "description": "" - }, - { - "name": "strength", - "type": "str", - "description": "" - }, - { - "name": "strength_begin", - "type": "int", - "description": "" - }, - { - "name": "strength_end", - "type": "int", - "description": "" - }, - { - "name": "strength_unit", - "type": "str", - "description": "" - }, - { - "name": "su_begin", - "type": "int", - "description": "" - }, - { - "name": "su_end", - "type": "int", - "description": "" - }, - { - "name": "form", - "type": "str", - "description": "Form refers to the physical appearance of the drug. E.g. cream" - }, - { - "name": "form_begin", - "type": "int", - "description": "" - }, - { - "name": "form_end", - "type": "int", - "description": "" - }, - { - "name": "frequency_unit", - "type": "str", - "description": "" - }, - { - "name": "fu_begin", - "type": "int", - "description": "" - }, - { - "name": "fu_end", - "type": "int", - "description": "" - }, - { - "name": "start_date", - "type": "str", - "description": "" - }, - { - "name": "reason", - "type": "Dict", - "key_type": "str", - "value_type": "int" - }, - { - "name": "change_status_begin", - "type": "int", - "description": "" - }, - { - "name": "change_status_end", - "type": "int", - "description": "" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.ChunkAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The value represents the unit portion of the drug frequency.", - "attributes": [ - { - "name": "sentence_id", - "type": "str", - "description": "" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.DrugLookupWindowAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "Similar to LookupWindowAnnotation however, these annotations are restricted to the segments/sections specified in the parameter - sectionOverrideSet - in 
DrugCNP2LookupWindow" - }, - { - "entry_name": "ftx.medical.clinical_ontology.NegationContext", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "A span based annotation `NegationContext`, used to represent the negation context of a named entity.", - "attributes": [ - { - "name": "polarity", - "type": "bool" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.UMLSConceptLink", - "parent_entry": "forte.data.ontology.top.Generics", - "description": "A umls concept entity, used to represent basic information of a umls concept", - "attributes": [ - { - "name": "cui", - "type": "str" - }, - { - "name": "name", - "type": "str" - }, - { - "name": "definition", - "type": "str" - }, - { - "name": "tuis", - "type": "List", - "item_type": "str" - }, - { - "name": "aliases", - "type": "List", - "item_type": "str" - }, - { - "name": "score", - "type": "str" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.MedicalEntityMention", - "parent_entry": "ft.onto.base_ontology.EntityMention", - "description": "A span based annotation class MedicalEntityMention, used to represent an Entity Mention in medical domain", - "attributes": [ - { - "name": "umls_link", - "type": "str" - }, - { - "name": "umls_entities", - "type": "List", - "item_type": "ftx.medical.clinical_ontology.UMLSConceptLink" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.MedicalArticle", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "An annotation which represents the whole medical text chunk/document", - "attributes": [ - { - "name": "icd_version", - "type": "int", - "description": "The version of ICD-Coding being used." - }, - { - "name": "icd_code", - "type": "str", - "description": "The ICD code assigned to current medical article." 
- } - ] - }, - { +{ + "name": "clinical_ontology", + "imports": [ + "base_ontology.json" + ], + "additional_prefixes": [ + "ftx.medical.clinical_ontology" + ], + "definitions": [ + { + "entry_name": "ftx.medical.clinical_ontology.ClinicalEntityMention", + "parent_entry": "ft.onto.base_ontology.EntityMention", + "description": "A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text." + }, + { + "entry_name": "ftx.medical.clinical_ontology.Description", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Description`, used to represent the description in a piece of clinical note." + }, + { + "entry_name": "ftx.medical.clinical_ontology.Body", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Body`, used to represent the actual content in a piece of clinical note." + }, + { + "entry_name": "ftx.medical.clinical_ontology.FrequencyAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The frequency determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.DurationAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The duration determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.RouteAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The route determination for the Drug NER profile.", + "attributes": [ + { + "name": "in_take_method", + "type": "str" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.SuffixStrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The suffix portion of dosage strength determination for the Drug NER profile." 
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.FractionStrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The fraction portion of dosages strength determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.RangeStrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The range portion of dosages stength determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.DecimalStrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The decimal portion of dosages stength determination for the Drug NER profile" + }, + { + "entry_name": "ftx.medical.clinical_ontology.DrugChangeStatusAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The change status of dosages determination for the Drug NER profile.", + "attributes": [ + { + "name": "change_status", + "type": "str", + "description": "Indicates the drug change status of 'stop', 'start', 'increase', 'decrease', or 'noChange'." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.DosagesAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The dosage determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.StrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "Holds the value representing the unit of the drug dosage." 
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.StrengthUnitAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "" + }, + { + "entry_name": "ftx.medical.clinical_ontology.FrequencyUnitAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The value represents the unit portion of the drug frequency.", + "attributes": [ + { + "name": "period", + "type": "float", + "description": "The periodic unit used, e.g day, month, hour, etc." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.FormAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The value represents the form portion of the drug mention." + }, + { + "entry_name": "ftx.medical.clinical_ontology.SubSectionAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "", + "attributes": [ + { + "name": "sub_ssection_body_begin", + "type": "int", + "description": "Sub-section body begin offset." + }, + { + "name": "sub_section_body_end", + "type": "int", + "description": "Sub-section body end offset." + }, + { + "name": "status", + "type": "int", + "description": "Status of 'possible', 'history of', or 'family history of'." + }, + { + "name": "sub_section_header_begin", + "type": "int", + "description": "Begin offset of subSection header" + }, + { + "name": "sub_section_header_end", + "type": "int", + "description": "Ending offset of subsection header" + }, + { + "name": "parent_section_id", + "type": "str", + "description": "The section in which the subsection was found." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.DrugMentionAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "", + "attributes": [ + { + "name": "status", + "type": "int", + "description": "" + }, + { + "name": "confidence", + "type": "float", + "description": "The confidence of the annotation." 
+ }, + { + "name": "frequency", + "type": "str", + "description": "Frequency refers to how often the patient needs to take the drug. Frequency is divided into frequency number and frequency unit. E.g. twice daily" + }, + { + "name": "frequency_begin", + "type": "int", + "description": "" + }, + { + "name": "frequency_end", + "type": "int", + "description": "" + }, + { + "name": "duration", + "type": "str", + "description": "Duration refers to for how long the patient is expected to take the drug. E.g. 'for 2 weeks' Strongly encouraged to use bold text" + }, + { + "name": "duration_begin", + "type": "int", + "description": "" + }, + { + "name": "duration_end", + "type": "int", + "description": "" + }, + { + "name": "route", + "type": "str", + "description": "Medication route refers to the way that a drug is introduced into the body. E.g oral Strongly encouraged to use bold text" + }, + { + "name": "route_begin", + "type": "int", + "description": "" + }, + { + "name": "route_end", + "type": "int", + "description": "" + }, + { + "name": "drug_change_status", + "type": "str", + "description": "Status refers to the whether the medication is currently being taken or not." + }, + { + "name": "dosage", + "type": "str", + "description": "Dosage refers to how many of each drug the patient is taking. E.g. 5 mg" + }, + { + "name": "dosage_begin", + "type": "int", + "description": "" + }, + { + "name": "dosage_end", + "type": "int", + "description": "" + }, + { + "name": "strength", + "type": "str", + "description": "" + }, + { + "name": "strength_begin", + "type": "int", + "description": "" + }, + { + "name": "strength_end", + "type": "int", + "description": "" + }, + { + "name": "strength_unit", + "type": "str", + "description": "" + }, + { + "name": "su_begin", + "type": "int", + "description": "" + }, + { + "name": "su_end", + "type": "int", + "description": "" + }, + { + "name": "form", + "type": "str", + "description": "Form refers to the physical appearance of the drug. 
E.g. cream" + }, + { + "name": "form_begin", + "type": "int", + "description": "" + }, + { + "name": "form_end", + "type": "int", + "description": "" + }, + { + "name": "frequency_unit", + "type": "str", + "description": "" + }, + { + "name": "fu_begin", + "type": "int", + "description": "" + }, + { + "name": "fu_end", + "type": "int", + "description": "" + }, + { + "name": "start_date", + "type": "str", + "description": "" + }, + { + "name": "reason", + "type": "Dict", + "key_type": "str", + "value_type": "int" + }, + { + "name": "change_status_begin", + "type": "int", + "description": "" + }, + { + "name": "change_status_end", + "type": "int", + "description": "" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.ChunkAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The value represents the unit portion of the drug frequency.", + "attributes": [ + { + "name": "sentence_id", + "type": "str", + "description": "" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.DrugLookupWindowAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "Similar to LookupWindowAnnotation however, these annotations are restricted to the segments/sections specified in the parameter - sectionOverrideSet - in DrugCNP2LookupWindow" + }, + { + "entry_name": "ftx.medical.clinical_ontology.NegationContext", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `NegationContext`, used to represent the negation context of a named entity.", + "attributes": [ + { + "name": "polarity", + "type": "bool" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.UMLSConceptLink", + "parent_entry": "forte.data.ontology.top.Generics", + "description": "A umls concept entity, used to represent basic information of a umls concept", + "attributes": [ + { + "name": "cui", + "type": "str" + }, + { + "name": "name", + "type": "str" + }, + { + "name": "definition", + 
"type": "str" + }, + { + "name": "tuis", + "type": "List", + "item_type": "str" + }, + { + "name": "aliases", + "type": "List", + "item_type": "str" + }, + { + "name": "score", + "type": "str" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalEntityMention", + "parent_entry": "ft.onto.base_ontology.EntityMention", + "description": "A span based annotation class MedicalEntityMention, used to represent an Entity Mention in medical domain", + "attributes": [ + { + "name": "umls_link", + "type": "str" + }, + { + "name": "umls_entities", + "type": "List", + "item_type": "ftx.medical.clinical_ontology.UMLSConceptLink" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalArticle", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An annotation which represents the whole medical text chunk/document", + "attributes": [ + { + "name": "icd_version", + "type": "int", + "description": "The version of ICD-Coding being used." + }, + { + "name": "icd_code", + "type": "str", + "description": "The ICD code assigned to current medical article." + } + ] + }, + { "entry_name": "ftx.medical.clinical_ontology.Abbreviation", "parent_entry": "forte.data.ontology.top.Annotation", "description": "A span based annotation `Abbreviation`, used to represent an abbreviated token..", @@ -411,8 +411,8 @@ "type": "str" } ] - }, - { + }, + { "entry_name": "ftx.medical.clinical_ontology.Hyponym", "parent_entry": "forte.data.ontology.top.Link", "description": "A `Link` type entry which represent a hyponym pair.", @@ -425,6 +425,16 @@ ], "parent_type": "ft.onto.base_ontology.Phrase", "child_type": "ft.onto.base_ontology.Phrase" - } - ] - } + }, + { + "entry_name": "ftx.medical.clinical_ontology.Disease", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Diesease`, used to represent the diseases in a piece of clinical text." 
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.Chemical", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Chemical`, used to represent the chemicals in a piece of clinical text." + } + ] +} \ No newline at end of file diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py new file mode 100644 index 00000000..55932867 --- /dev/null +++ b/fortex/health/processors/ner_label_processor.py @@ -0,0 +1,146 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Download locations of the ScispaCy models that are not available through
# spaCy's own ``download`` command, keyed by package name.
CUSTOM_SPACYMODEL_URL = {
    "en_core_sci_sm": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy"
    "/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz",
    "en_core_sci_md": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy"
    "/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz",
    "en_core_sci_lg": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy"
    "/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz",
    "en_ner_craft_md": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy"
    "/releases/v0.3.0/en_ner_craft_md-0.3.0.tar.gz",
    "en_ner_jnlpba_md": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy"
    "/releases/v0.3.0/en_ner_jnlpba_md-0.3.0.tar.gz",
    "en_ner_bc5cdr_md": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy"
    "/releases/v0.3.0/en_ner_bc5cdr_md-0.3.0.tar.gz",
    "en_ner_bionlp13cg_md": "https://s3-us-west-2.amazonaws.com/ai2-s2"
    "-scispacy/releases/v0.3.0/en_ner_bionlp13cg_md-0"
    ".3.0.tar.gz",
}


def load_lang_model(lang_model):
    """Load a spaCy or ScispaCy pipeline by package name.

    For names listed in ``CUSTOM_SPACYMODEL_URL`` the model package is
    imported directly.  Only when it is not importable yet is it installed
    with ``pip`` from the pinned URL, so a model the user has already
    installed is used as-is instead of being reinstalled on every call.
    Any other name is loaded through ``spacy.load`` and fetched with spaCy's
    downloader when it is missing.

    Args:
        lang_model: package name of the model, e.g. ``"en_ner_bc5cdr_md"``.

    Returns:
        The loaded spaCy pipeline object.
    """
    # pylint: disable=import-outside-toplevel
    if lang_model in CUSTOM_SPACYMODEL_URL:
        import importlib
        import os
        import subprocess
        import sys

        try:
            model_module = importlib.import_module(lang_model)
        except ImportError:
            # download ScispaCy model using URL
            download_url = CUSTOM_SPACYMODEL_URL[lang_model]
            command = [sys.executable, "-m", "pip", "install", download_url]
            # ``check=False``: a failed install is reported by the import
            # below, which raises when the package is still missing.
            subprocess.run(
                command, env=os.environ.copy(), encoding="utf8", check=False
            )
            # The package appeared after interpreter start-up, so the import
            # system's finder caches must be refreshed before importing it.
            importlib.invalidate_caches()
            model_module = importlib.import_module(lang_model)
        return model_module.load()

    # Use spaCy download
    try:
        nlp = spacy.load(lang_model)
    except OSError:
        download(lang_model)
        nlp = spacy.load(lang_model)
    return nlp


class NERLabelProcessor(PackProcessor):
    r"""
    Named entity labeling processor backed by a spaCy pretrained model.

    The configured model is run over the text of each data pack and the
    recognized ``DISEASE`` and ``CHEMICAL`` entities are added to the pack
    as `Disease` and `Chemical` annotations.

    Referred repository link:
    https://github.com/explosion/spaCy
    """

    def __init__(self):
        super().__init__()
        # The spaCy pipeline; created in `initialize`.
        self.nlp = None

    def initialize(self, resources: Resources, configs: Config):
        r"""
        Load the spaCy model named by the ``lang`` config.
        """
        super().initialize(resources, configs)
        self.nlp = load_lang_model(configs.lang)

    def _process(self, input_pack: DataPack):
        r"""
        NER Label processing is based on spaCy.

        Args:
            input_pack: the data pack whose text is labeled; annotations are
                added to it in place.
        """
        labels = self.configs.labels

        doc = input_pack.text

        # Do all process.
        if self.nlp is None:
            raise ProcessExecutionException(
                "The SpaCy pipeline is not initialized, maybe you "
                "haven't called the initialization function."
            )
        result = self.nlp(doc)

        # The requested labels do not change per entity, so resolve them once.
        want_disease = "disease" in labels
        want_chemical = "chemical" in labels

        for ent in result.ents:
            if want_disease and ent.label_ == "DISEASE":
                Disease(
                    pack=input_pack, begin=ent.start_char, end=ent.end_char
                )
            elif want_chemical and ent.label_ == "CHEMICAL":
                Chemical(
                    pack=input_pack, begin=ent.start_char, end=ent.end_char
                )

    @classmethod
    def default_configs(cls):
        r"""
        This defines a basic config structure for `NERLabelProcessor`.

        Following are the keys for this dictionary:
            - `labels`: ner labels to annotate; ``"disease"`` and
              ``"chemical"`` are the values checked by this processor.
            - `lang`: name of the spaCy model package to load.

        Returns: A dictionary with the default config for this processor.
        """
        return {"labels": ["disease", "chemical"], "lang": "en_ner_bc5cdr_md"}

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""
        Register the entry types this processor adds to the pack.

        Args:
            record_meta: the field in the datapack for type record that need to
                fill in for consistency checking.
        """
        # `Disease` and `Chemical` are defined in the clinical ontology
        # (imported from `ftx.medical.clinical_ontology`), not in the base
        # ontology, so they are recorded under that fully qualified name.
        record_meta["ftx.medical.clinical_ontology.Disease"] = set()
        record_meta["ftx.medical.clinical_ontology.Chemical"] = set()
@ddt
class TestNERLabelProcessor(unittest.TestCase):
    """Checks that `NERLabelProcessor` labels diseases and chemicals."""

    @data("He got cancer, and he needs oxygen.")
    def test_ner_label_processor(self, input_data):
        """Run the processor on one sentence and compare the entity texts."""
        self.nlp = Pipeline[DataPack]()
        self.nlp.set_reader(StringReader())
        config = {"labels": ["disease", "chemical"], "lang": "en_ner_bc5cdr_md"}

        self.nlp.add(NERLabelProcessor(), config=config)
        self.nlp.initialize()
        pack = self.nlp.process(input_data)

        # unittest assertions are used instead of bare ``assert`` so that the
        # checks still run under ``python -O`` and report both lists on
        # failure.
        exp_disease = ["cancer"]
        disease = [d.text for d in pack.get(Disease)]
        self.assertEqual(exp_disease, disease)

        exp_chemical = ["oxygen"]
        chemical = [c.text for c in pack.get(Chemical)]
        self.assertEqual(exp_chemical, chemical)