diff --git a/examples/label_example/README.md b/examples/label_example/README.md new file mode 100644 index 00000000..dbcb9bee --- /dev/null +++ b/examples/label_example/README.md @@ -0,0 +1,89 @@ +## NER Label Example + +This example shows how to start a search engine in streamlit and link the search results to Stave. + +## Install extra dependencies + +To install from PyPI, +```bash +pip install forte.elastic +pip install forte.health +pip install stave +pip install streamlit +``` + +## Download spaCy model + +Run the following command to download the model: +```bash +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz +``` + +## Set up the configuration +Before running the Elastic Searcher and Stave, we need to ensure that the current configuration is compatible with the environment of our computer. + +Please check and change the following configurations in `stave_config.yml`: + +1. Ensure `Stave.stave_db_path` points to the Stave database under `$HOME/.stave`, e.g., `"/home/name/.stave/db.sqlite3"`. +2. Ensure `Stave.username` and `Stave.pw` are `"admin"` and `"admin"`. + +## Prepare elastic searcher +Download the corresponding elasticsearch archive from https://www.elastic.co/downloads/past-releases/elasticsearch-7-17-2, unzip it and run `elasticsearch-7-17-2/bin/elasticsearch` to start the service. + +Run the following to check if elasticsearch is running properly: +```bash +curl -XGET localhost:9200/_cluster/health?pretty +``` + +Make sure you create the index 'elastic_indexer' in the cluster before working with this example; you can run the following command: +```bash +curl -X PUT localhost:9200/elastic_indexer +``` + +You can also follow the online blog for more information: + +https://www.elastic.co/guide/en/elasticsearch/reference/current/starting-elasticsearch.html + +## Run pipeline +First, you should start an Elastic Indexer backend. + +Now, open a terminal. You can run the following command to parse some files and index them. 
+```bash +python clinical_pipeline.py path_to_mimiciii/1.4/NOTEEVENTS.csv.gz path_to_mimiciii_output 10 1 +``` + +Here, we write out the raw data packs to `path_to_mimiciii_output`, and only index the first 10 notes. You can change the number to whatever you want in the above command. The final `1` selects the MIMIC-III reader; any other value uses the plain text reader. + +Also, we write the data into elasticsearch. You can run the following command to check whether the 10 notes were written into your database: + +```bash +curl -X GET localhost:9200/elastic_indexer/_search +``` + +## Run indexer and Stave +Again, you should start an Elastic Indexer backend. + +Then, to start the Stave server that our pipeline will connect to for visualization purposes, run +```bash +stave -s start -o -l -n 8899 +``` +Then, log in with username (admin) and password (admin). + +Here, you need to make sure `Stave.url` in `stave_config.yml` is `"http://localhost:8899"`. If you start Stave on a different port, update `Stave.url` to match. + +## Run streamlit + +To run streamlit, the python version should be >= 3.7.2. + +Now, open a terminal. Run the following command to start streamlit. +```bash +streamlit run search_engine.py +``` + +Now open `http://localhost:8501` in your browser to access the streamlit interface. + +Next, you will see the reports shown on the interface. You can also search with the search engine. + +Click a report link to open it in Stave, the visualization and annotation page. + +Select the radio buttons (Disease and Chemical) in the sidebar to see the annotations in the UI. 
def main(
    input_path: str,
    output_path: str,
    max_packs: int = -1,
    use_mimiciii_reader=1,
):
    """Label the input with NER annotations, index the packs and write them out.

    Args:
        input_path: Location handed to ``Pipeline.process_dataset``.
        output_path: Directory given to ``PackIdJsonPackWriter`` as
            ``output_dir``.
        max_packs: Forwarded to the MIMIC-III reader as ``max_num_notes``.
            Ignored when the plain text reader is selected.
        use_mimiciii_reader: ``1`` selects ``Mimic3DischargeNoteReader``; any
            other value selects ``PlainTextReader``.
    """
    pl = Pipeline[DataPack]()
    if use_mimiciii_reader == 1:
        pl.set_reader(
            Mimic3DischargeNoteReader(),
            config={"max_num_notes": max_packs},
        )
    else:
        pl.set_reader(PlainTextReader())

    config_for_ner = {
        "labels": ["disease", "chemical"],
        "lang": "en_ner_bc5cdr_md",
    }
    pl.add(NERLabelProcessor(), config=config_for_ner)

    pl.add(
        ElasticSearchPackIndexProcessor(),
        {
            "indexer": {
                "other_kwargs": {"refresh": True},
            }
        },
    )
    pl.add(
        PackIdJsonPackWriter(),
        {
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
            "drop_record": True,
            "zip_pack": False,
        },
    )

    pl.initialize()

    # The packs themselves are not needed here; iterating drives the pipeline.
    for count, _ in enumerate(pl.process_dataset(input_path), start=1):
        if count % 50 == 0:
            print(f"{time.strftime('%m-%d %H:%M')}: Processed {count} packs")


# Guarded so that importing this module does not start a pipeline run.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit(
            "usage: python clinical_pipeline.py INPUT_PATH OUTPUT_PATH "
            "[MAX_PACKS] [USE_MIMICIII_READER]"
        )
    # The two trailing arguments are optional and fall back to main()'s
    # defaults when omitted.
    main(sys.argv[1], sys.argv[2], *(int(arg) for arg in sys.argv[3:5]))
"ft.onto.base_ontology.Token", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation :class:`Token`, used to represent a token or a word.", + "attributes": [ + { + "name": "pos", + "type": "str" + }, + { + "name": "ud_xpos", + "type": "str", + "description": "Language specific pos tag. Used in CoNLL-U Format. Refer to https://universaldependencies.org/format.html" + }, + { + "name": "lemma", + "type": "str", + "description": "Lemma or stem of word form." + }, + { + "name": "chunk", + "type": "str" + }, + { + "name": "ner", + "type": "str" + }, + { + "name": "sense", + "type": "str" + }, + { + "name": "is_root", + "type": "bool" + }, + { + "name": "ud_features", + "type": "Dict", + "key_type": "str", + "value_type": "str" + }, + { + "name": "ud_misc", + "type": "Dict", + "key_type": "str", + "value_type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Subword", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "Used to represent subword tokenization results.", + "attributes": [ + { + "name": "is_first_segment", + "type": "bool" + }, + { + "name": "is_unk", + "type": "bool" + }, + { + "name": "vocab_id", + "type": "int" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Classification", + "parent_entry": "forte.data.ontology.top.Generics", + "description": "Used to store values for classification prediction", + "attributes": [ + { + "name": "classification_result", + "type": "Dict", + "key_type": "str", + "value_type": "float" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Document", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Document`, normally used to represent a document.", + "attributes": [ + { + "name": "document_class", + "type": "List", + "item_type": "str", + "description": "A list of class names that the document belongs to." 
+ }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classifications", + "type": "Dict", + "key_type": "str", + "value_type": "ft.onto.base_ontology.Classification", + "description": "Stores the classification results for this document. The key is the name/task of the classification, the value is an classification object storing the results." + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Sentence", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Sentence`, normally used to represent a sentence.", + "attributes": [ + { + "name": "speaker", + "type": "str" + }, + { + "name": "part_id", + "type": "int" + }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classification", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classifications", + "type": "Dict", + "key_type": "str", + "value_type": "ft.onto.base_ontology.Classification", + "description": "Stores the classification results for this sentence. The key is the name/task of the classification, the value is an classification object storing the results." + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Phrase", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Phrase`.", + "attributes": [ + { + "name": "phrase_type", + "type": "str" + }, + { + "name": "headword", + "type": "ft.onto.base_ontology.Token" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.UtteranceContext", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "`UtteranceContext` represents the context part in dialogue." 
+ }, + { + "entry_name": "ft.onto.base_ontology.Utterance", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Utterance`, normally used to represent an utterance in dialogue.", + "attributes": [ + { + "name": "speaker", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateArgument", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `PredicateArgument`, normally used to represent an argument of a predicate, can be linked to the predicate via the predicate link.", + "attributes": [ + { + "name": "ner_type", + "type": "str" + }, + { + "name": "predicate_lemma", + "type": "str" + }, + { + "name": "is_verb", + "type": "bool" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.EntityMention", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `EntityMention`, normally used to represent an Entity Mention in a piece of text.", + "attributes": [ + { + "name": "ner_type", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.EventMention", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `EventMention`, used to refer to a mention of an event.", + "attributes": [ + { + "name": "event_type", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateMention", + "parent_entry": "ft.onto.base_ontology.Phrase", + "description": "A span based annotation `PredicateMention`, normally used to represent a predicate (normally verbs) in a piece of text.", + "attributes": [ + { + "name": "predicate_lemma", + "type": "str" + }, + { + "name": "framenet_id", + "type": "str" + }, + { + "name": "is_verb", + "type": "bool" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateLink", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a semantic role link between a 
predicate and its argument.", + "attributes": [ + { + "name": "arg_type", + "type": "str", + "description": "The predicate link type." + } + ], + "parent_type": "ft.onto.base_ontology.PredicateMention", + "child_type": "ft.onto.base_ontology.PredicateArgument" + }, + { + "entry_name": "ft.onto.base_ontology.Dependency", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a syntactic dependency.", + "attributes": [ + { + "name": "dep_label", + "type": "str", + "description": "The dependency label." + }, + { + "name": "rel_type", + "type": "str" + } + ], + "parent_type": "ft.onto.base_ontology.Token", + "child_type": "ft.onto.base_ontology.Token" + }, + { + "entry_name": "ft.onto.base_ontology.EnhancedDependency", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a enhanced dependency: \n https://universaldependencies.org/u/overview/enhanced-syntax.html", + "attributes": [ + { + "name": "dep_label", + "type": "str", + "description": "The enhanced dependency label in Universal Dependency." + } + ], + "parent_type": "ft.onto.base_ontology.Token", + "child_type": "ft.onto.base_ontology.Token" + }, + { + "entry_name": "ft.onto.base_ontology.RelationLink", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a relation between two entity mentions", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EntityMention", + "child_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.CrossDocEntityRelation", + "parent_entry": "forte.data.ontology.top.MultiPackLink", + "description": "A `Link` type entry which represent a relation between two entity mentions across the packs.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." 
+ } + ], + "parent_type": "ft.onto.base_ontology.EntityMention", + "child_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.CoreferenceGroup", + "parent_entry": "forte.data.ontology.top.Group", + "description": "A group type entry that take `EntityMention`, as members, used to represent coreferent group of entities.", + "member_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.EventRelation", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a relation between two event mentions.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EventMention", + "child_type": "ft.onto.base_ontology.EventMention" + }, + { + "entry_name": "ft.onto.base_ontology.CrossDocEventRelation", + "parent_entry": "forte.data.ontology.top.MultiPackLink", + "description": "A `Link` type entry which represent a relation between two event mentions across the packs.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EventMention", + "child_type": "ft.onto.base_ontology.EventMention" + }, + { + "entry_name": "ft.onto.base_ontology.ConstituentNode", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `ConstituentNode` to represent constituents in constituency parsing. 
This can also sentiment values annotated on the nodes.", + "attributes": [ + { + "name": "label", + "type": "str" + }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "is_root", + "type": "bool" + }, + { + "name": "is_leaf", + "type": "bool" + }, + { + "name": "parent_node", + "type": "ft.onto.base_ontology.ConstituentNode" + }, + { + "name": "children_nodes", + "type": "List", + "item_type": "ft.onto.base_ontology.ConstituentNode" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Title", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Title`, normally used to represent a title." + }, + { + "entry_name": "ft.onto.base_ontology.Body", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Body`, normally used to represent a document body." + }, + { + "entry_name": "ft.onto.base_ontology.MCOption", + "parent_entry": "forte.data.ontology.top.Annotation" + }, + { + "entry_name": "ft.onto.base_ontology.MCQuestion", + "parent_entry": "forte.data.ontology.top.Annotation", + "attributes": [ + { + "name": "options", + "type": "List", + "item_type": "ft.onto.base_ontology.MCOption" + }, + { + "name": "answers", + "type": "List", + "item_type": "int" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.MRCQuestion", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An `Annotation` type which represents an MRC question.", + "attributes": [ + { + "name": "qid", + "type": "int" + }, + { + "name": "answers", + "type": "List", + "item_type": "ft.onto.base_ontology.Phrase" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Recording", + "parent_entry": "forte.data.ontology.top.AudioAnnotation", + "description": "A span based annotation `Recording`, normally used to represent a recording.", + "attributes": [ + { + "name": "recording_class", + "type": "List", + "item_type": "str", + 
"description": "A list of class names that the recording belongs to." + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.AudioUtterance", + "parent_entry": "forte.data.ontology.top.AudioAnnotation", + "description": "A span based annotation `AudioUtterance`, normally used to represent an utterance in dialogue.", + "attributes": [ + { + "name": "speaker", + "type": "str" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.NegationContext", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `NegationContext`, used to represent the negation context of a named entity.", + "attributes": [ + { + "name": "polarity", + "type": "bool" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalEntityMention", + "parent_entry": "ft.onto.base_ontology.EntityMention", + "description": "A span based annotation class MedicalEntityMention, used to represent an Entity Mention in medical domain", + "attributes": [ + { + "name": "umls_link", + "type": "str" + }, + { + "name": "umls_entities", + "type": "List", + "item_type": "ftx.medical.clinical_ontology.UMLSConceptLink" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalArticle", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An annotation based representation for the whole medical text chunk/document", + "attributes": [ + { + "name": "icd_version", + "type": "int", + "description": "The version of ICD-Coding being used." + }, + { + "name": "icd_code", + "type": "str", + "description": "The ICD code assigned to current medical article." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.Disease", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Disease`, used to represent the diseases in a piece of clinical text." 
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.Chemical", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Chemical`, used to represent the chemical in a piece of clinical text." + } + ] + }, + "config": { + "legendConfigs": { + "ft.onto.base_ontology.Token": { + "is_selected": false, + "is_shown": true, + "attributes": { + "pos": false, + "ud_xpos": false, + "lemma": false, + "chunk": false, + "ner": false, + "sense": false + } + }, + "ft.onto.base_ontology.Subword": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Classification": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.Document": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Sentence": { + "is_selected": false, + "is_shown": true, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.Phrase": { + "is_selected": false, + "is_shown": true, + "attributes": { + "phrase_type": false + } + }, + "ft.onto.base_ontology.UtteranceContext": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.Utterance": { + "is_selected": false, + "is_shown": false, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.PredicateArgument": { + "is_selected": false, + "is_shown": false, + "attributes": { + "ner_type": false, + "predicate_lemma": false + } + }, + "ft.onto.base_ontology.EntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "ner_type": false + } + }, + "ft.onto.base_ontology.EventMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "event_type": false + } + }, + "ft.onto.base_ontology.PredicateMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "predicate_lemma": false, + "framenet_id": false + } + }, + "ft.onto.base_ontology.PredicateLink": { + "is_selected": false, + "is_shown": false, + 
"attributes": { + "arg_type": false + } + }, + "ft.onto.base_ontology.Dependency": { + "is_selected": false, + "is_shown": false, + "attributes": { + "dep_label": false, + "rel_type": false + } + }, + "ft.onto.base_ontology.EnhancedDependency": { + "is_selected": false, + "is_shown": false, + "attributes": { + "dep_label": false + } + }, + "ft.onto.base_ontology.RelationLink": { + "is_selected": false, + "is_shown": true, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEntityRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CoreferenceGroup": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.EventRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEventRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.ConstituentNode": { + "is_selected": false, + "is_shown": false, + "attributes": { + "label": false + } + }, + "ft.onto.base_ontology.Title": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.Body": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.MCOption": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.MCQuestion": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.MRCQuestion": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.Recording": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.AudioUtterance": { + "is_selected": false, + "is_shown": false, + "attributes": { + "speaker": false + } + }, + "ftx.medical.clinical_ontology.NegationContext": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + 
"ftx.medical.clinical_ontology.MedicalEntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "umls_link": false + } + }, + "ftx.medical.clinical_ontology.MedicalArticle": { + "is_selected": false, + "is_shown": true, + "attributes": { + "icd_code": false + } + }, + "ftx.medical.clinical_ontology.Disease": { + "is_selected": false, + "is_shown": true + }, + "ftx.medical.clinical_ontology.Chemical": { + "is_selected": false, + "is_shown": true + } + }, + "scopeConfigs": { + "ft.onto.base_ontology.Token": false, + "ft.onto.base_ontology.Subword": false, + "ft.onto.base_ontology.Document": false, + "ft.onto.base_ontology.Sentence": false, + "ft.onto.base_ontology.Phrase": false, + "ft.onto.base_ontology.UtteranceContext": false, + "ft.onto.base_ontology.Utterance": false, + "ft.onto.base_ontology.PredicateArgument": false, + "ft.onto.base_ontology.EntityMention": false, + "ft.onto.base_ontology.EventMention": false, + "ft.onto.base_ontology.PredicateMention": false, + "ft.onto.base_ontology.ConstituentNode": false, + "ft.onto.base_ontology.Title": false, + "ft.onto.base_ontology.Body": false, + "ft.onto.base_ontology.MCOption": false, + "ft.onto.base_ontology.MCQuestion": false, + "ft.onto.base_ontology.MRCQuestion": false, + "ftx.medical.clinical_ontology.NegationContext": false, + "ftx.medical.clinical_ontology.MedicalEntityMention": false, + "ftx.medical.clinical_ontology.MedicalArticle": false, + "ftx.medical.clinical_ontology.Disease": false, + "ftx.medical.clinical_ontology.Chemical": false + }, + "layoutConfigs": { + "center-middle": "default-nlp", + "left": "default-meta", + "right": "default-attribute", + "center-bottom": "disable" + }, + "remoteConfigs": { + "pipelineUrl": "", + "doValidation": false, + "expectedName": "", + "inputFormat": "string", + "expectedRecords": {} + } + } +} \ No newline at end of file diff --git a/examples/label_example/demo/__init__.py b/examples/label_example/demo/__init__.py new file mode 100644 index 
@dataclass
class ClinicalEntityMention(EntityMention):
    """
    A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text.
    """

    # Declares no fields of its own: `pack`, `begin` and `end` are forwarded
    # unchanged to the `EntityMention` constructor.
    def __init__(self, pack: DataPack, begin: int, end: int):
        super().__init__(pack, begin, end)


@dataclass
class Description(Annotation):
    """
    A span based annotation `Description`, used to represent the description in a piece of clinical note.
    """

    # Declares no fields of its own: `pack`, `begin` and `end` are forwarded
    # unchanged to the `Annotation` constructor.
    def __init__(self, pack: DataPack, begin: int, end: int):
        super().__init__(pack, begin, end)
def _store_hits(conn, hits, project_id) -> List[int]:
    """Insert each hit's raw pack into Stave's document table; return the new row ids."""
    db_ids: List[int] = []
    for idx, hit in enumerate(hits):
        item = {
            "name": f"clinical_results_{idx}",
            # The raw pack string as stored in the search index.
            "textPack": hit["_source"]["pack_info"],
            "project_id": project_id,
        }
        db_ids.append(sqlite_insert(conn, "stave_backend_document", item))
    return db_ids


def main():
    """Render the search page and link every hit to a Stave document.

    Without a query all indexed reports are listed; with a query the matching
    reports are listed together with their highlighted snippets. Every call
    inserts the returned hits into the Stave database again, so each listed
    link points at a freshly created document row.
    """
    st.set_page_config(page_title="ForteHealth_Search_Engine", layout="wide")

    es = Elasticsearch(hosts=["http://localhost:9200/"])
    INDEX = "elastic_indexer"

    # Close the config file deterministically instead of leaking the handle.
    with open("stave_config.yml", "r") as config_file:
        config = Config(yaml.safe_load(config_file), default_hparams=None)

    default_project_json = get_json("default_onto_project.json")

    base_project_id = update_stave_db(default_project_json, config)

    st.title("Search the MIMIC III Data...")
    search = st.text_input("Enter search words:")

    if search:
        results = index_search(es, INDEX, search)
    else:
        results = all_search(es, INDEX)
    hits = results["hits"]["hits"]

    conn = sqlite3.connect(config.Stave.stave_db_path)
    try:
        answers = _store_hits(conn, hits, base_project_id)
    finally:
        # Release the database even when an insert fails.
        conn.close()

    links: List[str] = create_links(config.Stave.url, answers)

    if not search:
        for hit, db_id in zip(hits, answers):
            # Pack id (not database id) next to the row it was stored under.
            print(hit["_source"]["doc_id"], db_id)
        for link in links:
            st.write(link, unsafe_allow_html=True)
        return

    for link, hit in zip(links, hits):
        # A hit may come back without a highlight section; show an empty
        # snippet rather than failing the whole page.
        fragments = hit.get("highlight", {}).get("content", [])
        highlight = "...".join(fragments)
        st.write(link, unsafe_allow_html=True)
        st.write(
            templates.search_result(highlight.replace("\n", " ")),
            unsafe_allow_html=True,
        )


if __name__ == '__main__':
    main()
def index_search(es, index: str, keywords: str) -> dict:
    """
    Query the ``content`` field for the given keywords and return the raw
    search response, including highlight fragments and a match count
    aggregation.

    Args:
        es: Elasticsearch client instance.
        index: Name of the index we are going to use.
        keywords: Search keywords.
    """
    keyword_clause = {
        "query_string": {
            "query": keywords,
            "fields": ["content"],
            "default_operator": "AND",
        }
    }
    # NOTE(review): the highlight tags are a single space and an empty
    # string; confirm they were not meant to be HTML markers.
    highlight_spec = {
        "pre_tags": [' '],
        "post_tags": [""],
        "fields": {"content": {}},
    }
    match_counter = {"match_count": {"value_count": {"field": "_id"}}}

    return es.search(
        index=index,
        body={
            "query": {"bool": {"must": [keyword_clause]}},
            "highlight": highlight_spec,
            "aggs": match_counter,
        },
    )
def sqlite_insert(conn, table, row):
    """
    Insert one row into ``table`` and return the id of the new row.

    Values are bound as named parameters. The table and column names are
    interpolated into the statement text, so they must come from trusted code.

    Args:
        conn: connection
        table: table name
        row: inserted item, mapping column name to value
    """
    cols: str = ", ".join(f'"{col}"' for col in row)
    vals: str = ", ".join(f":{col}" for col in row)
    sql: str = f'INSERT INTO "{table}" ({cols}) VALUES ({vals})'
    cursor = conn.cursor()
    try:
        cursor.execute(sql, row)
        conn.commit()
        return cursor.lastrowid
    finally:
        cursor.close()


def create_links(url_stub: str, ids: List[int]) -> List[str]:
    """
    Build one HTML anchor per document id.

    Args:
        url_stub: url of stave
        ids: the doc ids of the reports
    """
    base_url: str = url_stub.strip("/")
    # NOTE(review): assumes Stave serves a stored document at
    # <url>/documents/<id> -- confirm against the deployed Stave version.
    return [
        f'<a href="{base_url}/documents/{doc_id}">Report #{doc_id}</a>'
        for doc_id in ids
    ]


def get_json(path: str):
    """
    Load and return the parsed content of a JSON file.

    Args:
        path: the file path of the json file
    """
    # The context manager closes the file even when parsing fails.
    with open(path, encoding="utf-8") as file_obj:
        return json.load(file_obj)


def update_stave_db(default_project_json, config):
    """
    Return the id of the Stave project named in ``default_project_json``,
    creating the project first when no project with that name exists.

    Args:
        default_project_json: the ontology configuration file
        config: the configuration of Stave, including url, name, password, etc.
    """
    project_name = default_project_json["name"]
    with StaveSession(url=config.Stave.url) as session:
        session.login(username=config.Stave.username, password=config.Stave.pw)

        # Reuse an existing project so repeated runs do not create duplicates.
        for project in session.get_project_list().json():
            if project["name"] == project_name:
                return project["id"]

        response = session.create_project(json.dumps(default_project_json))
        return json.loads(response.text)["id"]