From efb2f887a3b1a44924891264bf69512203c2197c Mon Sep 17 00:00:00 2001 From: AlekSimpson Date: Sun, 27 Apr 2025 12:42:08 -0700 Subject: [PATCH 1/4] llm comparison works good enough now, suggested edit position output works also --- fastapi/app/ai/llm_comparison.py | 33 ++++-- fastapi/app/prompts/first_pass.txt | 86 +++++++++------ fastapi/app/prompts/second_pass.bak | 104 ++++++++++++++++++ fastapi/app/prompts/second_pass.txt | 86 +++++++++------ fastapi/app/prompts/system_prompt.txt | 147 -------------------------- fastapi/app/testdata/obama_A.txt | 2 +- fastapi/app/testdata/obama_B.txt | 2 +- 7 files changed, 238 insertions(+), 222 deletions(-) create mode 100644 fastapi/app/prompts/second_pass.bak delete mode 100644 fastapi/app/prompts/system_prompt.txt diff --git a/fastapi/app/ai/llm_comparison.py b/fastapi/app/ai/llm_comparison.py index f0d2766..aa08b37 100644 --- a/fastapi/app/ai/llm_comparison.py +++ b/fastapi/app/ai/llm_comparison.py @@ -3,13 +3,15 @@ import json def llm_semantic_comparison(buffer_a, buffer_b): + # TODO: could be improved with input from the cosine similarity comparison as well -- works good enough for now though + # will plan to do this for next semester def remove_think_section(text): # Uses regex to remove section and its contents return re.sub(r'.*?', '', text, flags=re.DOTALL) - def comparison_prompt(buffer_a, buffer_b): + def comparison_prompt(buffer_a, buffer_b, pass_text): try: - file = open("prompts/system_prompt.txt", 'r') + file = open(pass_text, 'r') system_prompt_text = file.read() except Exception: raise Exception("trouble opening system prompt file.") @@ -19,15 +21,26 @@ def comparison_prompt(buffer_a, buffer_b): return system_prompt - prompt = comparison_prompt(buffer_a, buffer_b) + first_pass_prompt = comparison_prompt(buffer_a, buffer_b, "prompts/first_pass.txt") + second_pass_prompt = comparison_prompt(buffer_a, buffer_b, "prompts/second_pass.txt") + prompts = [first_pass_prompt, second_pass_prompt] + responses = [None, None] + + for response_index, prompt in enumerate(prompts): + server_response = llama.generate(model='deepseek-r1:8b', prompt=prompt, options={ + 'temperature': 0.0 + }) + print(server_response['response']) + prompt_response = remove_think_section(server_response['response']) + prompt_response = prompt_response.replace('```json', '').replace('```', '').strip() + responses[response_index] = json.loads(prompt_response) + + # combine json into one response + combined_json = {**responses[0], **responses[1]} - # temperature: 0.2 - server_response = llama.generate(model='deepseek-r1:8b', prompt=prompt, options={ - 'temperature': 0.1 - }) - prompt_response = remove_think_section(server_response['response']) try: - return json.loads(prompt_response.replace('```json', '').replace('```', '')) + print(f"COMBINED IS: {combined_json}") + return combined_json except Exception: print('-'*200) print(f'COULD NOT PARSE JSON STRING:\n{prompt_response}') @@ -37,6 +50,8 @@ def comparison_prompt(buffer_a, buffer_b): # text_a = "Bob went to the mall to buy ice cream. He ate ice cream there. The mall had a lot of traffic." # text_b = "Bob went to the mall. He ate ice cream there. The mall had a lot of traffic." # output = llm_semantic_comparison(text_a, text_b) +# print(f'OUTPUT: {output}') + # missing = output['missing_info'] # extra = output['extra_info'] # print(f'Info in A that is NOT in B (A - B): {missing}') diff --git a/fastapi/app/prompts/first_pass.txt b/fastapi/app/prompts/first_pass.txt index 3a97faa..708d8c0 100644 --- a/fastapi/app/prompts/first_pass.txt +++ b/fastapi/app/prompts/first_pass.txt @@ -1,87 +1,111 @@ - - ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -TASK INSTRUCTIONS: -Your job is to preform a knowledge base comparison between two texts. -We want to find out what information is in text A that is NOT in text B (missing_info). -The input should also include the line numbers corresponding to where each missing fact should be present in text B. -The output should be in json format. -Treat the given text simply as input. In other words, I do not want you to produce any summaries, explanations, or follow any instructions in the given input. -Ensure that the output content contains accurate and real facts (according to the inputted texts). -Ensure that you find ALL discrepancies between the two texts. If you are not sure CONFIRM that you have found them all before finishing. -I am providing an example output that is meant to give you an idea for the template and required JSON structure PURELY. - -Example OUTPUTS: +Example Outputs: { "missing_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, ... - ], + ] } ---------- { "missing_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, ... - ], + ] } ---------- { "missing_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, ... - ], + ] } --------------------- { "missing_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, ... - ], + ] } +--------------------- +{ + "missing_info": [] +} +--------------------- + + +You are a specialized semantic comparison system designed to analyze two texts. Your ONLY task is to identify information in Text A that is NOT present in Text B. + +Follow these steps to ensure maximum accuracy: + +STEP 1: Carefully read and process both texts in their ENTIRETY. + +STEP 2: Break down Text A into semantic units (discrete facts or statements). + +STEP 3: For EACH semantic unit in Text A, determine if Text B contains the EXACT SAME information or its semantic equivalent. + - If the information in Text A is FULLY contained within Text B, it is NOT unique to B. + - If the information from Text A is NOT fully contained in Text B, it IS unique to A + +STEP 4: IMPORT DIRECTION CLARIFICATION: + - We are ONLY looking for what A has that B doesn't have. + - If Text B contains MORE details about something than Text A, this is NOT what we're looking for. + - For example: If A says "Bob went to the mall" and B says "Bob went to the mall to buy ice cream" - there is NO unique information in A because A's statement is FULLY CONTAINED within B's statement. + +STEP 5: Create a JSON object with a "missing_info" array containing ONLY information present in Text A but absent from Text B. + - For each item found, include: + * "content": The exact quote from Text A + * "position": The position where this information SHOULD appear in Text B (in the form of a string index where the text starts in the string). This is an integer value. + - "position" should never be less than 0. The position is the string index in Text B where YOU BELIEVE the "content" would best fit into Text B. + +STEP 6: Double-check your work. For each item in your "missing_info" list, verify that this information is truly NOT present in any form in Text B. + - If a distinct detail is not present in Text B then include it in the list. + - If an implied fact thats not entirely obviously implied is not present in Text B then include it in the list. + - Don't overthink things, if something is FLAT OUT NOT MENTIONED at all in Text B then include it in your list. + +STEP 7: If NO information in Text A is unique (not in Text A), return exactly: {"missing_info": []} -The output lists should contain as many entries as needed to cover ALL information discrepancies. +Your output must be ONLY valid JSON with no other text, comments, or explanations. +YOUR RESPONSE MUST BE IN JSON FORMAT, DO NOT GIVE ANY OTHER RESPONSE THAT DOES NOT FOLLOW THE FORMAT STRUCTURE OF THE JSON OBJECTS GIVEN IN THE OUTPUT FORMAT EXAMPLES. diff --git a/fastapi/app/prompts/second_pass.bak b/fastapi/app/prompts/second_pass.bak new file mode 100644 index 0000000..3847339 --- /dev/null +++ b/fastapi/app/prompts/second_pass.bak @@ -0,0 +1,104 @@ +Example Outputs: +{ + "extra_info": [ + { + "content": "", + "position": "" + }, + { + "content": "", + "position": "" + }, + ... + ] +} +---------- +{ + "extra_info": [ + { + "content": "", + "position": "" + }, + { + "content": "", + "position": "" + }, + { + "content": "", + "position": "" + }, + { + "content": "", + "position": "" + }, + ... + ] +} +---------- +{ + "extra_info": [ + { + "content": "", + "position": "" + }, + { + "content": "", + "position": "" + }, + ... + ] +} +--------------------- +{ + "extra_info": [ + { + "content": "", + "position": "" + }, + { + "content": "", + "position": "" + }, + { + "content": "", + "position": "" + }, + { + "content": "", + "position": "" + }, + ... + ] +} +--------------------- +{ + "extra_info": [] +} +--------------------- + + +You are a specialized semantic comparison system (known for incredible accuracy) designed to analyze two texts (Text A and Text B) that cover similar topics. Your sole task is to identify ALL information present in Text B that is NOT present in Text A. These texts cover similar topics but may or may not contain different details. + +Follow these steps to ensure maximum accuracy: + +STEP 1: Carefully read and process both texts in their ENTIRETY. + +STEP 2: Break down Text B into distinct semantic units or claims. A semantic unit is a discrete piece of information that stands alone as a fact, statement, or assertion. + - NOTE: Differences in length of similar sentences alone does not consitute any meaningful difference and does not imply that that information is or isn't present in the text to compare against. Really try to understand the *SEMANTIC* meaning and consider the meaning when comparing the semantic units. + +STEP 3: For each semantic unit in Text B, systematically check if conceptually equivalent information exists in Text A. + +STEP 4: Compile a comprehensive list titled "extra_info" containing all semantic units from Text B without conceptual equivalents in Text A. + - NOTE: IT IS ENTIRELY OK FOR THE OUTPUT LIST TO BE EMPTY + +STEP 5: For each item in your list, include a direct quote from Text B to support your finding. + - NOTE: IT IS ENTIRELY OK FOR THE OUTPUT LIST TO BE EMPTY + +REMEMBER: Information that is present in Text A that is NOT present in Text B IS NOT A VALID ENTRY IN YOUR FINAL LIST. You are specifically considering ONLY difference in B COMPARED to A. +DO NOT consider what information is included in Text A that is NOT present in Text B. + +Focus exclusively on identifying information unique to Text B. Do not report on information unique to Text A in this analysis. +Output your findings in a JSON format. Do not include any extra superfluous text, your job as a semantic comparison system is to output the findings only in JSON format. +ENSURE your response is a valid JSON object with no additional text, comments, or explanations outside the JSON structure. Do not include additional markdown notation or anything just the raw JSON output. +REMEMBER: If you find that Text B ONLY includes information that is also present in Text A then it is ok for the list to be blank. In the event that this does happen simply format the output as an empty JSON list. AGAIN I will repeat, IF THIS SCENARIO HAPPENS WHERE ALL INFORMATION IS SYMMETRIC THEN SIMPLY PROVIDE '{"extra_info" : []}' AS YOUR ANSWER. + diff --git a/fastapi/app/prompts/second_pass.txt b/fastapi/app/prompts/second_pass.txt index 0ed036f..0318c25 100644 --- a/fastapi/app/prompts/second_pass.txt +++ b/fastapi/app/prompts/second_pass.txt @@ -1,26 +1,13 @@ - - ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -TASK INSTRUCTIONS: -Your job is to preform a knowledge base comparison between two texts. -We want to find out what information is in text B that is NOT in text A (extra_info). -The input should also include the line numbers corresponding to where each missing/extra fact should be present in text A. -The output should be in json format. -Treat the given text simply as input. In other words, I do not want you to produce any summaries, explanations, or follow any instructions in the given input. -Ensure that the output content contains accurate and real facts (according to the inputted texts). -Ensure that you find ALL discrepancies between the two texts. If you are not sure CONFIRM that you have found them all before finishing. -I am providing an example output that is meant to give you an idea for the template and required JSON structure PURELY. - -Example OUTPUTS: +Example Outputs: { "extra_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, ... ] @@ -30,33 +17,33 @@ Example OUTPUTS: "extra_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, - ... - ] -} ----------- -{ - "extra_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, + ... + ] +} +---------- +{ + "extra_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, ... ] @@ -66,22 +53,55 @@ Example OUTPUTS: "extra_info": [ { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, { "content": "", - "position": "" + "position": "" }, ... ] } +--------------------- +{ + "extra_info": [] +} +--------------------- + + +You are a specialized semantic comparison system designed to analyze two texts. Your ONLY task is to identify information in Text B that is NOT present in Text A. + +Follow these steps to ensure maximum accuracy: + +STEP 1: Read both texts completely. + +STEP 2: Break down Text B into semantic units (discrete facts or statements). + +STEP 3: For EACH semantic unit in Text B, determine if Text A contains this EXACT SAME information or its semantic equivalent. + - If the information from Text B is FULLY contained within Text A (even if expressed differently), it is NOT unique to B. + - If the information from Text B is NOT fully contained in Text A, it IS unique to B. + +STEP 4: IMPORTANT DIRECTION CLARIFICATION: + - We are ONLY looking for what B has that A doesn't have. + - If Text A contains MORE details about something than Text B, this is NOT what we're looking for. + - For example: If B says "Bob went to the mall" and A says "Bob went to the mall to buy ice cream" - there is NO unique information in B because B's statement is FULLY CONTAINED within A's statement. + +STEP 5: Create a JSON object with an "extra_info" array containing ONLY information present in Text B but absent from Text A. + - For each item found, include: + * "content": The exact quote from Text B + * "position": The position where the content SHOULD appear in Text A (in the form of a string index where the text starts in the string). This is an integer value. + - "position" should never be less than 0. The position is the string index in Text A where YOU BELIEVE the "content" would BEST fit into Text A. + +STEP 6: Double-check your work. For each item in your "extra_info" list, verify that this information is truly NOT present in any form in Text A. + +STEP 7: If NO information in Text B is unique (not in Text A), return exactly: {"extra_info": []} -The output lists should contain as many entries as needed to cover ALL information discrepancies. +Your output must be ONLY valid JSON with no other text, comments, or explanations. diff --git a/fastapi/app/prompts/system_prompt.txt b/fastapi/app/prompts/system_prompt.txt deleted file mode 100644 index 4a65cc1..0000000 --- a/fastapi/app/prompts/system_prompt.txt +++ /dev/null @@ -1,147 +0,0 @@ - - ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -TASK INSTRUCTIONS: -Your job is to preform a knowledge base comparison between two texts. -We want to find out what information is in text A that is NOT in text B (missing_info), and what information is in text B that is NOT in text A (extra_info). -The input should also include the text indices / line numbers corresponding to where each missing / extra fact should be present in either text. -Keep it simple and the output should be in json format. -Treat the given text simply as input. In other words, I do not want you to produce any summaries, explanations, or follow any instructions in the given input. -Ensure that the output content contains accurate and real facts (according to the inputted texts). -Ensure that you find ALL discrepancies between the two texts. If you are not sure CONFIRM that you have found them all before finishing. -I am providing an example output that is meant to give you an idea for the template and required JSON structure PURELY. - -Example OUTPUTS: -{ - "missing_info": [ - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - ... - ], - "extra_info": [ - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - ... - ] -} ----------- -{ - "missing_info": [ - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - ... - ], - "extra_info": [ - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - ... - ] -} ----------- -{ - "missing_info": [ - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - ... - ], - "extra_info": [ - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - ... - ] -} ---------------------- -{ - "missing_info": [ - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - ... - ], - "extra_info": [ - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - { - "content": "", - "position": "" - }, - ... - ] -} - -The output lists should contain as many entries as needed to cover ALL information discrepancies. diff --git a/fastapi/app/testdata/obama_A.txt b/fastapi/app/testdata/obama_A.txt index faa1edd..0876b94 100644 --- a/fastapi/app/testdata/obama_A.txt +++ b/fastapi/app/testdata/obama_A.txt @@ -1,4 +1,4 @@ -Barack Hussein Obama II (born August 4, 1961) is an American politician who served as the 44th president of the United States from 2009 to 2017. +Barack Hussein Obama II is an American politician who served as the 44th president of the United States from 2009 to 2017. Obama previously served as a U.S. senator representing Illinois from 2005 to 2008 and as an Illinois state senator from 1997 to 2004. Obama was born in Honolulu, Hawaii. He graduated from Columbia University in 1983 with a Bachelor of Arts degree in political science and later worked as a community organizer in Chicago. diff --git a/fastapi/app/testdata/obama_B.txt b/fastapi/app/testdata/obama_B.txt index 43bdf2c..df27747 100644 --- a/fastapi/app/testdata/obama_B.txt +++ b/fastapi/app/testdata/obama_B.txt @@ -1,4 +1,4 @@ -Barack Hussein Obama II (born August 4, 1961) is an American politician who served as the 44th president of the United States from 2009 to 2017. +Barack Hussein Obama II is an American politician who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, he was the first African-American president in American history. He graduated from Columbia University in 1983 with a Bachelor of Arts degree in political science and later worked as a community organizer in Chicago. In 1988, Obama enrolled in Harvard Law School, where he was the first black president of the Harvard Law Review. From a28d4567c8c8934fb4eb402d89a4754218ab8ebf Mon Sep 17 00:00:00 2001 From: AlekSimpson Date: Tue, 23 Sep 2025 13:35:33 -0700 Subject: [PATCH 2/4] stashing --- fastapi/app/main.py | 83 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 12 deletions(-) diff --git a/fastapi/app/main.py b/fastapi/app/main.py index 41ae029..28c256e 100644 --- a/fastapi/app/main.py +++ b/fastapi/app/main.py @@ -26,13 +26,34 @@ ''' -comparison_models = [ - "sentence-transformers/LaBSE", - "xlm-roberta-base", - "multi-qa-distilbert-cos-v1", - "multi-qa-MiniLM-L6-cos-v1", - "multi-qa-mpnet-base-cos-v1" -] +class ArticleDeconstruct: + # todo: make these dicts and have the keys be unique ids for each section so that we can iterate in order + text_sections: list[str] = [] + media_sections: list[str] = [] # list of image url links for markdown + tabular: dict = {} + +class BackendDataStore: + comparison_models: list[str] = [ + "sentence-transformers/LaBSE", + "xlm-roberta-base", + "multi-qa-distilbert-cos-v1", + "multi-qa-MiniLM-L6-cos-v1", + "multi-qa-mpnet-base-cos-v1" + ] + selected_model: str = comparison_models[0] + article_deconstruct: ArticleDeconstruct = None + + # todo: expose these functions to the api + def set_selected_model(self, choice: str) -> bool: + if choice not in comparison_models: + return False + + selected_model = choice + return True + + def available_models_list(self): + return comparison_models + # Configure logging logging.basicConfig( @@ -61,9 +82,11 @@ class ArticleComparisonResponse(BaseModel): extra_info: List # Class defines the API reponse format for source article (output) -class TranslateArticleResponse(BaseModel): +class ArticleResponse(BaseModel): translated_article: str + + wiki_wiki = wikipediaapi.Wikipedia(user_agent='MyApp/2.0 (contact@example.com)', language='en') # English Wikipedia instance # Function to get the URL of Wikipedia page from title as input @@ -128,7 +151,7 @@ def get_article(url: str = Query(None), title: str = Query(None)): return {"source_article": article_content, "article_languages": languages} -@app.get("/wiki_translate/source_article", response_model=TranslateArticleResponse) +@app.get("/wiki_translate/source_article", response_model=ArticleResponse) def translate_article(url: str = Query(None), title: str = Query(None), language: str = Query(...)): logging.info(f"Calling translate article endpoint for title: {title}, url: {url} and language: {language}") @@ -176,10 +199,46 @@ def compare_articles(text_a: str, text_b: str, similarity_threshold: float = 0.7 # missing_info, extra_info = perform_semantic_comparison(text_a, text_b, similarity_threshold, model_name) # return {"missing_info": missing_info, "extra_info": extra_info} + output = llm_semantic_comparison(text_a, text_b) - x = {"missing_info": output['missing_info'], "extra_info": output['extra_info']} - print(x) - return x + # response = {"missing_info": output['missing_info'], "extra_info": output['extra_info']} + # return response + return output + +@app.get("/synthesis/full", response_model=ArticleResponse) +def synthesize_full_article(target_language: str, article_a: str, article_b: str, article_synth_base: int): + if article_synth_base < 0 or article_synth_base >= 2: + raise HTTPException(status_code=400, detail="article_synth_base must be 0 or 1") + # todo: check if language target is supported + + deconstruct_a = deconstruct_article(article_a) + deconstruct_b = deconstruct_article(article_b) + + target_base = deconstruct_a if article_synth_base == 0 else deconstruct_b + comp_base = deconstruct_b if article_synth_base == 0 else deconstruct_a + + missing = {} + extra = {} + + synthesis = [] + + # todo: this full comparison code should be in the sem. comparison (underlying function) endpoint also + for id, text_a in enumerate(target_base.text): + for text_b in comp_base.text: # use multiprocessing + output = llm_semantic_comparison(text_a, text_b) + missing[id] = output['missing_info'] + extra[id] = output['extra_info'] + + for id, text in enumerate(target_base.text): + # synthesize paragraph + para_synth = synthesize_paragraph(text, missing[id]) + # add to full synthesis + synthesis.append(para_synth) + + synthesis.extend(target_base.media) + synthesis.extend(target_base.tabular) + + return synthesis if __name__ == '__main__': From d475137c4b8f6b1ec9d56994057bec8207d869d9 Mon Sep 17 00:00:00 2001 From: AlekSimpson Date: Thu, 2 Oct 2025 17:50:10 -0700 Subject: [PATCH 3/4] article deconstruction function works, added central backend data store, added ArticleModel model, among other things --- __pycache__/__init__.cpython-310.pyc | Bin 0 -> 160 bytes backend_requirements.txt | 198 +++++++++++++++++++++++++ fastapi/app/ai/synthesis.py | 192 +++++++++++++++++++++++++ fastapi/app/ai/text_recon.py | 208 --------------------------- fastapi/app/main.py | 22 +-- setup_instructions.md | 13 ++ 6 files changed, 416 insertions(+), 217 deletions(-) create mode 100644 __pycache__/__init__.cpython-310.pyc create mode 100644 backend_requirements.txt create mode 100644 fastapi/app/ai/synthesis.py delete mode 100644 fastapi/app/ai/text_recon.py create mode 100644 setup_instructions.md diff --git a/__pycache__/__init__.cpython-310.pyc b/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b7afce632413794a64e24dae36582cce3abe7b7 GIT binary patch literal 160 zcmd1j<>g`kf~_oW^O@oexANdYH@Z+et~{LQGQlxa!Ij%0ED3%T$!7jT2fT0>*%Q;AD@|*SrQ+w ZS5SG2!zMRBr8Fni4rF*S6OdqG007=_CT{=$ literal 0 HcmV?d00001 diff --git a/backend_requirements.txt b/backend_requirements.txt new file mode 100644 index 0000000..173aac2 --- /dev/null +++ b/backend_requirements.txt @@ -0,0 +1,198 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: osx-64 +aiohttp=3.8.6=pypi_0 +aiosignal=1.3.1=pypi_0 +allennlp=2.10.1=pypi_0 +allennlp-models=2.10.1=pypi_0 +annotated-types=0.5.0=pypi_0 +anyio=3.7.1=pyhd8ed1ab_0 +async-timeout=4.0.3=pypi_0 +asynctest=0.13.0=pypi_0 +attrs=24.2.0=pypi_0 +base58=2.1.1=pypi_0 +blis=0.2.4=pypi_0 +boto3=1.33.13=pypi_0 +botocore=1.33.13=pypi_0 +brotli-python=1.0.9=py37h0582d14_7 +ca-certificates=2025.8.3=hbd8a1cb_0 +cached-path=1.1.6=pypi_0 +cached-property=1.5.2=pypi_0 +cachetools=5.5.2=pypi_0 +certifi=2024.8.30=pyhd8ed1ab_0 +cffi=1.15.1=py37h7346b73_1 +charset-normalizer=3.4.1=pypi_0 +click=8.1.8=pypi_0 +colorama=0.4.6=pyhd8ed1ab_0 +commonmark=0.9.1=pypi_0 +conllu=4.4.2=pypi_0 +coreftools=1.1.1=pypi_0 +cymem=2.0.11=pypi_0 +dataclasses=0.8=pyhc8e2a94_3 +datasets=2.10.1=pypi_0 +dill=0.3.6=pypi_0 +distro=1.9.0=pypi_0 +docker-pycreds=0.4.0=pypi_0 +en-core-web-sm=2.1.0=pypi_0 +exceptiongroup=1.2.2=pypi_0 +fairscale=0.4.6=pypi_0 +fastapi=0.103.2=pyhd8ed1ab_0 +filelock=3.7.1=pypi_0 +freetype=2.13.3=h40dfd5c_0 +frozenlist=1.3.3=pypi_0 +fsspec=2023.1.0=pyhd8ed1ab_0 +ftfy=6.1.1=pypi_0 +gitdb=4.0.12=pypi_0 +gitpython=3.1.44=pypi_0 +google-api-core=2.24.2=pypi_0 +google-auth=2.38.0=pypi_0 +google-cloud-core=2.4.3=pypi_0 +google-cloud-storage=2.19.0=pypi_0 +google-crc32c=1.5.0=pypi_0 +google-resumable-media=2.7.2=pypi_0 +googleapis-common-protos=1.69.2=pypi_0 +h11=0.14.0=pypi_0 +h5py=3.8.0=pypi_0 +httpcore=0.17.3=pypi_0 +httpx=0.24.1=pypi_0 +huggingface-hub=0.10.1=pypi_0 +huggingface_hub=0.16.4=pyhd8ed1ab_0 +idna=3.10=pyhd8ed1ab_0 +importlib-metadata=6.7.0=pypi_0 +importlib_metadata=4.11.4=hd8ed1ab_0 +iniconfig=2.0.0=pypi_0 +jmespath=1.0.1=pypi_0 +joblib=1.3.2=pyhd8ed1ab_0 +jpeg=9e=hb7f2c08_3 +jsonnet=0.20.0=pypi_0 +jsonschema=2.6.0=pypi_0 +lcms2=2.14=h90f4b2a_0 +lerc=4.0.0=hb486fe8_0 +libblas=3.9.0=20_osx64_openblas +libcblas=3.9.0=20_osx64_openblas +libcxx=20.1.3=hf95d169_0 +libdeflate=1.14=hb7f2c08_0 +libffi=3.4.4=hecd8cb5_1 +libgfortran=14.2.0=hef36b68_105 +libgfortran5=14.2.0=h58528f3_105 +libhwloc=2.11.2=default_h4cdd727_1001 +libiconv=1.18=h4b5e92a_1 +liblapack=3.9.0=20_osx64_openblas +liblzma=5.8.1=hd471939_0 +libopenblas=0.3.25=openmp_hfef2a42_0 +libpng=1.6.47=h3c4a55f_0 +libprotobuf=3.21.12=h7d26f99_2 +libsqlite=3.45.2=h92b6c6a_0 +libtiff=4.4.0=h6268bbc_5 +libwebp-base=1.5.0=h6cf52b4_0 +libxcb=1.13=h0d85af4_1004 +libxml2=2.13.7=h3fbc333_1 +libzlib=1.3.1=hd23fc13_2 +llvm-openmp=20.1.3=ha54dae1_0 +lmdb=1.6.2=pypi_0 +mkl=2022.2.1=h44ed08c_16952 +more-itertools=9.1.0=pypi_0 +multidict=6.0.5=pypi_0 +multiprocess=0.70.14=pypi_0 +murmurhash=1.0.12=pypi_0 +ncurses=6.4=hcec6c5f_0 +neuralcoref=4.0=pypi_0 +ninja=1.12.1=h3c5361c_0 +nltk=3.8.1=pyhd8ed1ab_0 +numpy=1.21.6=py37h345d48f_0 +ollama=0.6.7=cpu_h77ccaa4_ +openai=1.39.0=pypi_0 +openjpeg=2.5.0=h5d0d7b0_1 +openssl=3.5.3=h230baf5_1 +packaging=24.0=pypi_0 +pandas=1.3.5=pypi_0 +pathtools=0.1.2=pypi_0 +pillow=9.5.0=pypi_0 +pip=22.3.1=py37hecd8cb5_0 +plac=0.9.6=pypi_0 +pluggy=1.2.0=pypi_0 +preshed=2.0.1=pypi_0 +promise=2.3=pypi_0 +proto-plus=1.26.1=pypi_0 +protobuf=3.20.0=pypi_0 +psutil=7.0.0=pypi_0 +pthread-stubs=0.4=h00291cd_1002 +py-rouge=1.1=pypi_0 +pyarrow=12.0.1=pypi_0 +pyasn1=0.5.1=pypi_0 +pyasn1-modules=0.3.0=pypi_0 +pycparser=2.21=pyhd8ed1ab_0 +pydantic=2.5.3=pypi_0 +pydantic-core=2.14.6=pypi_0 +pydot=2.0.0=pypi_0 +pygments=2.17.2=pypi_0 +pyparsing=3.1.4=pypi_0 +pysocks=1.7.1=py37hf985489_5 +pytest=7.4.4=pypi_0 +python=3.7.12=hf3644f1_100_cpython +python-dateutil=2.9.0.post0=pypi_0 +python_abi=3.7=4_cp37m +pytorch=1.12.1=cpu_py37h3bab975_1 +pytz=2025.2=pypi_0 +pyyaml=6.0.1=pypi_0 +readline=8.2=hca72f7f_0 +regex=2024.4.16=pypi_0 +requests=2.31.0=pypi_0 +responses=0.18.0=pypi_0 +rich=12.6.0=pypi_0 +rsa=4.9=pypi_0 +s3transfer=0.8.2=pypi_0 +sacremoses=0.0.53=pyhd8ed1ab_0 +scikit-learn=1.0.2=py37h572704e_0 +scipy=1.7.3=py37h4e3cf02_0 +sentence-transformers=2.2.2=pyhd8ed1ab_0 +sentencepiece=0.2.0=pypi_0 +sentry-sdk=2.25.1=pypi_0 +setproctitle=1.3.3=pypi_0 +setuptools=65.6.3=py37hecd8cb5_0 +shellingham=1.5.4=pypi_0 +shortuuid=1.0.13=pypi_0 +six=1.17.0=pypi_0 +sleef=3.8=hfe0d17b_0 +smmap=5.0.2=pypi_0 +sniffio=1.3.1=pypi_0 +spacy=2.1.0=pypi_0 +sqlite=3.45.2=h7461747_0 +srsly=1.0.7=pypi_0 +stanford-openie=1.3.2=pypi_0 +stanfordcorenlp=3.9.1.1=pypi_0 +stanfordnlp=0.2.0=pypi_0 +starlette=0.27.0=pyhd8ed1ab_0 +tbb=2021.13.0=hb890de9_1 +tensorboardx=2.6.2.2=pypi_0 +termcolor=1.1.0=pypi_0 +thinc=7.0.8=pypi_0 +threadpoolctl=3.1.0=pyh8a188c0_0 +tk=8.6.13=h1abcd95_1 +tokenizers=0.12.1=pypi_0 +tomli=2.0.1=pypi_0 +torchvision=0.13.1=pypi_0 +tqdm=4.67.1=pyhd8ed1ab_0 +traitlets=5.9.0=pypi_0 +transformers=4.20.1=pypi_0 +typer=0.15.2=pypi_0 +typing-extensions=4.7.1=hd8ed1ab_0 +typing_extensions=4.7.1=pyha770c72_0 +urllib3=1.26.20=pypi_0 +uvicorn=0.19.0=py37hf985489_0 +wandb=0.12.21=pypi_0 +wasabi=0.10.1=pypi_0 +wcwidth=0.2.13=pypi_0 +wget=3.2=pypi_0 +wheel=0.38.4=py37hecd8cb5_0 +wikipedia-api=0.6.0=pyhd8ed1ab_0 +word2number=1.1=pypi_0 +xorg-libxau=1.0.12=h6e16a3a_0 +xorg-libxdmcp=1.1.5=h00291cd_0 +xxhash=3.5.0=pypi_0 +xz=5.6.4=h46256e1_1 +yaml=0.2.5=h0d85af4_2 +yarl=1.9.4=pypi_0 +zipp=3.15.0=pyhd8ed1ab_0 +zlib=1.3.1=hd23fc13_2 +zstd=1.5.7=h8210216_2 diff --git a/fastapi/app/ai/synthesis.py b/fastapi/app/ai/synthesis.py new file mode 100644 index 0000000..0e4db01 --- /dev/null +++ b/fastapi/app/ai/synthesis.py @@ -0,0 +1,192 @@ +import requests +from html_to_markdown import convert_to_markdown +import re +from typing import List, Tuple +from dataclasses import dataclass, field + +TITLE = 0 +TEXT = 1 +MEDIA = 2 +TABULAR = 3 +REFERENCES = 4 + +@dataclass +class TextFragment: + text: str + missing_info: List[str] = field(default_factory=list) + +@dataclass +class ArticleModel: + titles: list[str] = field(default_factory=list) + text: list[TextFragment] = field(default_factory=list) + media: list[str] = field(default_factory=list) + tabular: list[str] = field(default_factory=list) + references: list[str] = field(default_factory=list) + + structure: List[Tuple[int, int]] = field(default_factory=list) + + def add_to_section(self, section_type: int, section_content: str): + idx = 0 + if section_type == 0: + self.titles.append(section_content) + idx = len(self.titles)-1 + elif section_type == 1: + self.text.append(TextFragment(text=section_content)) + idx = len(self.text)-1 + elif section_type == 2: + self.media.append(section_content) + idx = len(self.media)-1 + elif section_type == 3: + self.tabular.append(section_content) + idx = len(self.tabular)-1 + elif section_type == 4: + self.references.append(section_content) + idx = len(self.references)-1 + self.structure.append((section_type, idx)) + +def html_to_md(page_name): + params = { + "action": "parse", + "page": page_name, + "prop": "text", + "format": "json", + "formatversion": 2, + "redirects": 1 + } + + r = requests.get("https://en.wikipedia.org/w/api.php", params=params, headers={"User-Agent": "YourAppName/1.0 (contact@example.com)"}, timeout=30) + html = r.json()["parse"]["text"] + markdown = convert_to_markdown(html) + + return markdown + +def create_article_model(md_content: str) -> ArticleModel: + def html_image_to_markdown(html_img): + src_match = re.search(r'src=["\']([^"\']+)["\']', html_img) + alt_match = re.search(r'alt=["\']([^"\']*)["\']', html_img) + + if not src_match: + return html_img + + src = src_match.group(1) + alt = alt_match.group(1) if alt_match else '' + + return f'![{alt}]({src})' + + def is_wiki_reference(text: str) -> bool: + pattern = r'^\d+\.\s+(\*\*\[\^.*?\]\(#cite_ref-.*?\)\*\*|\^\s+\[)' + return bool(re.match(pattern, text.strip(), re.DOTALL)) + + def strip_wiki_links(text: str) -> str: + wiki_link_pattern = re.compile( + r'\[([^\]]+)\]\(\s*(?:[^)\s]*?/wiki/[^)\s]*)(?:\s+"[^"]*")?\s*\)' + ) + return wiki_link_pattern.sub(r'\1', text) + + def remove_inline_citations(text: str) -> str: + citation_pattern = re.compile( + r'\[*\\\[\s*\d+\s*\]\s*\]\(\s*#cite_note-\d+(?:-[^)]+)?\s*\)' + ) + return citation_pattern.sub('', text) + + def is_table_row(text: str) -> bool: + pattern = r'^\|.*\|$|^[\|\s]*[-:]+[\|\s\-:]*$' + return bool(re.match(pattern, text)) + + def is_image(text: str) -> bool: + pattern = r']*?src=["\'].*?["\'][^>]*?/?>' + return bool(re.match(pattern, text)) + + def remove_wiki_edit_links(text): + pattern = r'\[\[edit\]\([^)]+\)\]' + return re.sub(pattern, '', text) + + model = ArticleModel() + if not md_content: + return model + + # clean article first + content = strip_wiki_links(md_content) + content = remove_inline_citations(content) + content = content.replace("\\", "") # todo: remove '\' characters + content = remove_wiki_edit_links(content) # todo: remove wiki [edit] links + + # process references first + refs_heading_pattern = re.compile( + r'(?m)^(?:#{1,6}\s*References\s*$|References\s*\n[-=]{3,}\s*$)' + ) + refstart = refs_heading_pattern.search(content) + references_str_raw = content[refstart.start():len(content)] + + for ref in references_str_raw.split("\n"): + if is_wiki_reference(ref): + model.add_to_section(REFERENCES, ref.strip()) + + # process rest of article + article_content = content[0:refstart.start()] + article_lines = article_content.split("\n") + line_idx = 1 + article_end = len(article_lines)-1 + + def peek(idx): + if idx == article_end: + return "" + return article_lines[idx+1] + + def parse_table(start_idx): + table_str = article_lines[start_idx] + "\n" + idx = start_idx+1 + while idx <= article_end and is_table_row(article_lines[idx]): + table_str += article_lines[idx] + "\n" + idx += 1 + + model.add_to_section(TABULAR, table_str) + return idx + + def parse_image(start_idx): + image_str = article_lines[start_idx] + "\n" + idx = start_idx+1 + nextline = article_lines[idx] + while nextline.strip() == "": + idx+=1 + nextline = article_lines[idx] + + if nextline.startswith("*") and nextline.endswith("*"): + image_str += nextline + "\n" + + model.add_to_section(MEDIA, image_str) + return idx+1 + + while line_idx <= article_end: + if article_lines[line_idx] == "" or article_lines[line_idx].isspace(): + line_idx+=1 + continue + + if is_table_row(article_lines[line_idx]): + line_idx = parse_table(line_idx) + continue + elif is_image(article_lines[line_idx]): + line_idx = parse_image(line_idx) + continue + + if peek(line_idx).startswith("-"): + model.add_to_section(TITLE, article_lines[line_idx]) + line_idx += 2 + continue + + # else its text + model.add_to_section(TEXT, article_lines[line_idx]) + line_idx += 1 + + for x in model.tabular: + print("-"*50) + print(x) + + return model + + +# article_titles = ["Pet door", "Owner-occupancy"] +# +# md = html_to_md(article_titles[1]) +# model = create_article_model(md) + diff --git a/fastapi/app/ai/text_recon.py b/fastapi/app/ai/text_recon.py deleted file mode 100644 index 91be41d..0000000 --- a/fastapi/app/ai/text_recon.py +++ /dev/null @@ -1,208 +0,0 @@ -import requests -from html_to_markdown import convert_to_markdown -import re -from typing import List, Tuple - -def html_to_md(page_name): - params = { - "action": "parse", - "page": "Pet door", #replace - "prop": "text", - "format": "json", - "formatversion": 2, - "redirects": 1 - } - r = requests.get("https://en.wikipedia.org/w/api.php", params=params, headers={"User-Agent": "YourAppName/1.0 (contact@example.com)"}, timeout=30) - html = r.json()["parse"]["text"] - out_path = "page.html" - with open(out_path, "w", encoding="utf-8") as f: - f.write(html) - markdown = convert_to_markdown(html) - md_pth = "page.md" - with open(md_pth, "w", encoding="utf-8") as f: - f.write(markdown) - - return markdown - - -import re -from typing import List, Tuple - - -import re -from typing import List, Tuple - -def split_markdown_advanced(content: str) -> Tuple[List[str], List[str], List[str], List[str]]: - """ - Returns: - - image_lines: full lines like [](...) captured verbatim - - tables: raw Markdown table blocks - - paragraphs: remaining paragraphs after filtering - - references_blocks: a list with one element containing the entire References section as a single string - (or empty if not found), with '/wiki/' links stripped to plain text and edit line removed - - Behaviors: - - Remove any Markdown links whose URL contains '/wiki/' (preserve label). - - Remove any line that starts with '[[edit]'. - - Remove inline citation anchors like '[[5]](#cite_note-5)' or '[[12]](#cite_note-12)'. - - Image lines that start with '[](...)' are returned in image_lines and removed from paragraphs. - - References section (from its heading to before the next heading or EOF) is extracted and returned as one string. - """ - if not content: - return [], [], [], [] - - original = content - - - - # Utility: strip wiki links "[label](/wiki/...)" -> "label" - wiki_link_pattern = re.compile( - r'\[([^\]]+)\]\(\s*(?:[^)\s]*?/wiki/[^)\s]*)(?:\s+"[^"]*")?\s*\)' - ) - def strip_wiki_links(text: str) -> str: - return wiki_link_pattern.sub(r'\1', text) - - # Utility: remove inline citation links of the form [[5]](#cite_note-5) or [[7]](#cite_note-7) - # Be lenient about the fragment suffix: #cite_note-7, #cite_note-7-0, etc. - citation_pattern = re.compile( - r'\[*\\\[\s*\d+\s*\]\s*\]\(\s*#cite_note-\d+(?:-[^)]+)?\s*\)' - ) - def remove_inline_citations(text: str) -> str: - return citation_pattern.sub('', text) - - # 1) Extract the References section first, so it can be returned verbatim (with requested cleanups). - refs_heading_pattern = re.compile( - r'(?m)^(?:#{1,6}\s*References\s*$|References\s*\n[-=]{3,}\s*$)' - ) - references_blocks: List[str] = [] - text = original - - m = refs_heading_pattern.search(text) - if m: - start = m.start() - # Find end: next heading (ATX or Setext) after start, else EOF - next_heading_pattern = re.compile( - r'(?m)^(?:#{1,6}\s*\S.*$|[^\n]+\n[-=]{3,}\s*$)' - ) - next_m = next_heading_pattern.search(text, m.end()) - end = next_m.start() if next_m else len(text) - refs_raw = text[start:end] - - # Inside references: remove the edit line and inline citation anchors, strip /wiki/ links - refs_lines = [] - for line in refs_raw.splitlines(): - if line.lstrip().startswith('[[edit]'): - continue - refs_lines.append(line) - refs_clean = "\n".join(refs_lines) - refs_clean = remove_inline_citations(refs_clean) - refs_clean = strip_wiki_links(refs_clean) - - # Keep as a single string - refs_clean = refs_clean.strip() - if refs_clean: - references_blocks.append(refs_clean) - - # Remove the references block from main text - text = text[:start] + text[end:] - - # 3) Extract full image lines of the form [](...) - image_lines: List[str] = [] - remaining_lines = [] - img_line_pattern = re.compile(r'^\s*\[\s*]*>\s*\]\([^)]+\)\s*$', re.IGNORECASE) - for line in text.splitlines(): - if img_line_pattern.match(line): - image_lines.append(line.rstrip()) - continue - remaining_lines.append(line) - text = "\n".join(remaining_lines) - - # 2) Global removals/rewrites on the remaining text - # 2a) Remove any line that starts with '[[edit]' - kept_lines = [] - for line in text.splitlines(): - if line.lstrip().startswith('\\[[edit]'): - continue - kept_lines.append(line) - text = "\n".join(kept_lines) - - # 2b) Remove inline citation anchors like [[5]](#cite_note-5) - text = remove_inline_citations(text) - - # 2c) Strip /wiki/ links globally, preserving label - text = strip_wiki_links(text) - - - - # 4) Extract Markdown tables as blocks - lines = text.splitlines() - tables: List[str] = [] - used_line_idx = set() - - def is_table_sep(line: str) -> bool: - return bool(re.match( - r'^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$', - line - )) - - i = 0 - while i < len(lines): - if "|" in lines[i]: - j = i + 1 - found_sep = False - while j < len(lines) and (j - i) <= 5 and "|" in lines[j]: - if is_table_sep(lines[j]): - found_sep = True - break - j += 1 - if found_sep: - start = i - end = j + 1 - while end < len(lines) and "|" in lines[end]: - end += 1 - block = "\n".join(lines[start:end]).strip() - if block: - tables.append(block) - for idx in range(start, end): - used_line_idx.add(idx) - i = end - continue - i += 1 - - # 5) Build paragraphs from remaining lines (excluding table lines and blank separators) - paragraphs: List[str] = [] - buf: List[str] = [] - - def flush_buf(): - if buf: - block = "\n".join(buf).strip() - if block: - paragraphs.append(block) - buf.clear() - - for idx, line in enumerate(lines): - if idx in used_line_idx: - flush_buf() - continue - if line.strip() == "": - flush_buf() - else: - buf.append(line) - flush_buf() - - # Remove accidental table separators in paragraphs if any - cleaned_paragraphs = [] - for block in paragraphs: - if any(is_table_sep(l) for l in block.splitlines()): - continue - cleaned_paragraphs.append(block) - - return image_lines, tables, cleaned_paragraphs, references_blocks - -md = html_to_md("") -dedup_images, tables, cleaned_paragraphs, references = split_markdown_advanced(md) - -print(dedup_images) -print(tables) -print(cleaned_paragraphs) -print(references) \ No newline at end of file diff --git a/fastapi/app/main.py b/fastapi/app/main.py index bdfcd73..da9c342 100644 --- a/fastapi/app/main.py +++ b/fastapi/app/main.py @@ -1,11 +1,19 @@ #!/bin/bash -from fastapi import FastAPI +from fastapi import FastAPI, HTTPException import uvicorn import logging from fastapi.middleware.cors import CORSMiddleware -from app.api import wiki_article -from app.model.request import Url +from fastapi import Query +from pydantic import BaseModel +from starlette.config import Config +from starlette.requests import Request +from starlette.responses import JSONResponse +import wikipediaapi +from typing import List + +from .api import wiki_article +from .model.request import Url from .ai.semantic_comparison import perform_semantic_comparison from .ai.llm_comparison import llm_semantic_comparison @@ -23,11 +31,7 @@ ''' -class ArticleDeconstruct: - # todo: make these dicts and have the keys be unique ids for each section so that we can iterate in order - text_sections: list[str] = [] - media_sections: list[str] = [] # list of image url links for markdown - tabular: dict = {} + class BackendDataStore: comparison_models: list[str] = [ @@ -38,7 +42,7 @@ class BackendDataStore: "multi-qa-mpnet-base-cos-v1" ] selected_model: str = comparison_models[0] - article_deconstruct: ArticleDeconstruct = None + article_model: ArticleModel= None # todo: expose these functions to the api def set_selected_model(self, choice: str) -> bool: diff --git a/setup_instructions.md b/setup_instructions.md new file mode 100644 index 0000000..6569520 --- /dev/null +++ b/setup_instructions.md @@ -0,0 +1,13 @@ + +Backend Server Installation and Startup Instructions + +1. First setup your python environment. Conda is the reccomended tool for creating the python environments. + - The command to setup your environment is: `conda create --name --file backend_requirements.txt` + +2. Once that has successfully installed activate the environment with: conda activate + +3. Now navigate to the fastapi/app/ directory + +4. To start the server run: `fastapi dev main.py` + + From 1d31c0c18d4703815be6d3a277811916380c04be Mon Sep 17 00:00:00 2001 From: AlekSimpson Date: Thu, 9 Oct 2025 15:46:58 -0700 Subject: [PATCH 4/4] stashing changes 8/9/2025 --- fastapi/app/ai/SYNTH_DEMO.md | 182 +++++++++++++++++++++++ fastapi/app/ai/synthesis.py | 64 ++++++-- fastapi/app/ai/synthesis_demo.py | 41 +++++ fastapi/app/main.py | 49 +++--- fastapi/app/{test => tests}/__init__.py | 0 fastapi/app/{test => tests}/test_main.py | 0 6 files changed, 297 insertions(+), 39 deletions(-) create mode 100644 fastapi/app/ai/SYNTH_DEMO.md create mode 100644 fastapi/app/ai/synthesis_demo.py rename fastapi/app/{test => tests}/__init__.py (100%) rename fastapi/app/{test => tests}/test_main.py (100%) diff --git a/fastapi/app/ai/SYNTH_DEMO.md b/fastapi/app/ai/SYNTH_DEMO.md new file mode 100644 index 0000000..1083f6e --- /dev/null +++ b/fastapi/app/ai/SYNTH_DEMO.md @@ -0,0 +1,182 @@ +1. **[^](#cite_ref-1)** *Koren, Liran (2022-04-13). ["Owner-Occupied vs. Non-Owner-Occupied Real Estate: What's the Difference?"](https://luxurypropertycare.com/owner-occupied-vs-non-owner-occupied/). *Luxury Property Care*. Retrieved 2023-07-28.* +2. **[^](#cite_ref-2)** *["Public spending on financial support to homebuyers"](https://www.oecd.org/els/family/PH2-1-Public-spending-support-to-home-buyers.pdf) (PDF). *OECD*. 16 December 2019. Retrieved 21 September 2020.* +3. **[^](#cite_ref-3)** *Delouya, Samantha (2024-12-16). ["The median renter in America has a net worth of $10,400. The median homeowner's net worth is $400,000 | CNN Business"](https://www.cnn.com/2024/12/16/economy/renter-homeowner-net-worth-gap). *CNN*. Retrieved 2025-06-16.* +4. **[^](#cite_ref-4)** *["The Advantages of Renting"](https://www.npr.org/templates/story/story.php?storyId=100961300).* +5. **[^](#cite_ref-5)** *["Shelter, or burden?"](https://www.economist.com/briefing/2009/04/16/shelter-or-burden). *The Economist*. 2009-04-16.* +6. **[^](#cite_ref-6)** *Ansell, Ben W. (2019). ["The Politics of Housing"](https://doi.org/10.1146%2Fannurev-polisci-050317-071146). *Annual Review of Political Science*. **22**: 165–185. doi "Doi (identifier)"):[10.1146/annurev-polisci-050317-071146](https://doi.org/10.1146%2Fannurev-polisci-050317-071146).* +7. **[^](#cite_ref-7)** *["What Is the Homeownership Rate?"](https://www.thebalancemoney.com/the-homeownership-rate-what-is-it-and-how-is-it-calculated-4175698). *www.thebalancemoney.com*. 9 November 2021. [Archived](http://www.answers.com/topic/homeownership-rate-1) from the original on 9 October 2024. Retrieved 9 October 2024.* +8. **[^](#cite_ref-NARre2024_8-0)** Exhibit 1–1 by *NAR Research Staff (2025). ["Profile of Home Buyers and Sellers 2024"](https://www.lirealtor.com/docs/default-source/default-document-library/2024_profile_of_home_buyers_and_sellers_report_final.pdf) (PDF). National Association or Realtors. p. 10. [Archived](https://web.archive.org/web/20250314124228/https://www.lirealtor.com/docs/default-source/default-document-library/2024_profile_of_home_buyers_and_sellers_report_final.pdf) (PDF) from the original on March 14, 2025.* +9. ^ [***a***](#cite_ref-:0_9-0) [***b***](#cite_ref-:0_9-1) [Housing Finance Information Network (HOFINET)](http://hofinet.org) +10. **[^](#cite_ref-10)** *["Homeownership rate in selected European countries in 2023, by country"](https://www.statista.com/statistics/246355/home-ownership-rate-in-europe/). *Statista*. 5 September 2024. Retrieved 9 October 2024.* +11. **[^](#cite_ref-11)** *["Síntese de Indicadores Sociais"](https://www.ibge.gov.br/estatisticas/sociais/habitacao/9221-sintese-de-indicadores-sociais.html). *IBGE* (in Portuguese). Retrieved 2024-12-03.* +12. **[^](#cite_ref-12)** *C.Textor (25 April 2024). ["Share of households owning the housing property they occupy in Hong Kong from 2000 to 2023"](https://www.statista.com/statistics/1463140/hong-kong-share-of-home-ownership/). *Statista*. Retrieved 9 October 2024.* +13. **[^](#cite_ref-13)** *["Statistics Iceland: Many low-income households in central Reykjavík and in Ásbrú"](https://statice.is/publications/news-archive/census/census-2021-households-income-and-car-availability/). *Statistics Iceland*. Retrieved 2024-10-09.* +14. **[^](#cite_ref-14)** +15. **[^](#cite_ref-15)** *["HOUSING CHARACTERISTICS BY RESIDENTIAL STATUS, OWNERSHIP AND NUMBER OF ROOMS BY REGION, PAKISTAN"](https://www.pbs.gov.pk/sites/default/files/population/2023/tables/table_25_national.pdf) (PDF).* +16. **[^](#cite_ref-16)** *["The 20-Year Housing Development Master Plan BE 2560-2579 (2017-2036)"](https://www.ohchr.org/sites/default/files/documents/issues/housing/cfi-housing-affordability/submission-srhousing-cfi-housing-affordability-states-Thailand.doc). *www.ohchr.org*. 2021. Retrieved 9 October 2024.* +17. **[^](#cite_ref-17)** *["Homeownership Rate"](https://www.ibisworld.com/us/bed/homeownership-rate/4623/). *www.ibisworld.com*. 30 September 2024. Retrieved 9 October 2024.* +| Part of a series on | +| --- | +| Housing | +| | +| Types* House (detached * semi-detached * terraced) * Apartment * Bungalow * Cottage * Ecohouse * Executive * Green home * Human outpost * I-house * Ranch * Tenement * Luxury * Mixed-use development * Hotel * Hostel * Castle * Flophouse * Shack * Slum * Shanty town * Villa **Ownership*** Community land trust * Condominium * Cooperative * Informal * Owner-occupancy * Public housing * Squat | +| Issues* Environmental + Design + Planning + Racism + Security * Healthiness * Crisis + Affordability - By country - Index + Subsidized - Home mortgage interest deduction * Inequality * Housing discrimination + Redlining * Development + Building code + Economics + Permit + Planning - Participatory - Conflict + Impact fee + NIMBY + YIMBY + Zoning * Rent + Control + Regulation + Strike + Tenants union * Investing + Appraisal + Bubble + Price index + Subprime lending * Sustainable + Architecture + Development + Living + City Homelessness* Eviction + Just cause + Tenant right to counsel * Housing First * Stress * Rapid Re-Housing * Right to housing | +| Other* 15-minute city * Alternative lifestyle * Assisted living * Boomtown * Cottage homes * Eco-cities * Ecovillage * Foster care * Green building * Group home * Halfway house * Healthy community design * Homeless shelter * Hospital * Local community * Log house * Natural building * Nursing home * Orphanage * Prison * Psychiatric hospital * Residential care * Residential treatment center * Retirement community * Retirement home * Supportive housing * Supported living **Housing portal** | +| * v (View this template) * t (Discuss this template) * e (Edit this template) | + +**L'occupation par le propriétaire** ou **l'accession à la propriété** est une forme d'occupation du logement dans laquelle une personne, appelée **propriétaire occupant**, **propriétaire occupant** ou **propriétaire de la maison**, est propriétaire de la maison dans laquelle elle vit. La maison peut être une maison, comme une maison unifamiliale, un appartement, une copropriété ou une coopérative d'habitation. En plus de fournir un logement, la propriété fonctionne également comme un investissement immobilier. + +## Acquisition + +Certaines maisons sont construites par les propriétaires avec l'intention d'être occupées. Beaucoup sont hérités. Un grand nombre sont achetés comme maisons neuves auprès d'un promoteur immobilier ou comme maisons existantes auprès d'un ancien propriétaire ou propriétaire occupant. + +Une maison est généralement l’achat le plus cher qu’un individu ou une famille effectue et coûte souvent plusieurs fois le revenu annuel du ménage. Compte tenu du coût élevé, la plupart des particuliers ne disposent pas de suffisamment d’économies pour payer la totalité du montant. Dans les pays développés, des prêts hypothécaires sont disponibles auprès des institutions financières en échange d'intérêts. Si le propriétaire ne respecte pas le calendrier de remboursement convenu, une saisie (appelée reprise de possession dans certains pays) peut en résulter. + +De nombreux pays proposent une aide aux futurs acheteurs pour qu’ils puissent effectuer leurs achats. Ces mesures comprennent des subventions, des prêts hypothécaires subventionnés et des garanties hypothécaires. Les acheteurs potentiels peuvent devoir remplir certaines conditions de ressources pour avoir droit à une aide gouvernementale, comme être un premier acheteur ou avoir un revenu inférieur à un certain seuil. + +## Avantages et inconvénients + +Les perspectives concernant les avantages et les risques de la propriété ne sont pas universellement acceptées et dépendent des circonstances et des motivations individuelles. + +L'accession à la propriété donne aux occupants le droit de modifier le bâtiment et le terrain à leur guise (sous réserve des restrictions du gouvernement, de l'association des propriétaires et des actes), les protège de l'expulsion et crée un droit d'occupation qui peut être hérité. Les propriétés transmises peuvent être louées (comme dans le cas d'un propriétaire intentionnel ou accidentel) ou vendues dans le cadre d'une succession « Succession (loi) »). Dans certaines juridictions, cela confère également certains droits légaux en ce qui concerne les butoirs. + +Les maisons et les terrains sur lesquels elles sont situées sont souvent chers, et la combinaison des mensualités hypothécaires, des assurances, de l'entretien et des réparations, ainsi que des taxes foncières est parfois supérieure aux coûts de location mensuels. Les bâtiments peuvent également gagner ou perdre de la valeur substantielle en raison des fluctuations du marché immobilier, et la vente d'une propriété peut prendre beaucoup de temps, en fonction des conditions du marché. Cela peut rendre l'accession à la propriété plus contraignante si le propriétaire envisage de déménager "Déménagement (personnel)") à une date ultérieure. Certains propriétaires considèrent leur achat comme un [investissement](/w/index.php?title=Commodification_of_housing&action=edit&redlink=1 "Marchandisation du logement (la page n'existe pas)") et ont l'intention de vendre ou de louer la propriété après avoir rénové ou laissé la maison prendre de la valeur (on parle de retournement si cela est fait rapidement). En 2024, la valeur nette médiane du propriétaire était d'environ 400 000 $ et la valeur nette médiane du locataire était de 10 400 $. + +La location peut être plus avantageuse que l'occupation par le propriétaire lorsque le locataire a besoin de flexibilité pour déménager là où se trouvent les opportunités de travail. Lorsqu’une situation de travail à long terme est réglée, le locataire peut alors réévaluer les coûts de location et d’accession à la propriété. + +Traditionnellement, l'accession à la propriété a été encouragée par les gouvernements des pays occidentaux (en particulier les pays anglophones) car c'était un moyen pour les gens d'acquérir une richesse générationnelle dans le cadre de la [marchandisation du logement](/w/index.php?title=Commodification_of_housing&action=edit&redlink=1 "Marchandisation du logement (la page n'existe pas)"), on pensait qu'elle encourageait l'épargne et qu'elle favorisait l'engagement civique. Cependant, le krach du marché immobilier lors de la crise financière de 2008 dans la plupart des pays anglophones a amené les universitaires et les décideurs politiques à remettre en question cette logique. + +## Influence politique + +Posséder une maison influence la façon dont un individu perçoit le rôle du gouvernement. Les données des pays de l’OCDE montrent que lorsque les prix de l’immobilier augmentent, les individus se montrent plus critiques à l’égard de l’État-providence. À l’inverse, lorsque les prix de l’immobilier baissent, les propriétaires sont plus susceptibles de favoriser l’intervention du gouvernement. Aux États-Unis, les régions où les taux d’accession à la propriété sont élevés ont des taux de participation électorale plus élevés. Il existe également une relation faible entre l’accession à la propriété et le soutien aux candidats républicains. Les données provenant du Royaume-Uni confortent l’idée selon laquelle les propriétaires considèrent la valeur de leur maison comme une sorte de police d’assurance privée et informelle contre les chocs économiques. Une maison suffisamment précieuse protège le propriétaire sans nécessiter l’intervention du gouvernement. + +Les propriétaires sont généralement tenus de payer périodiquement la taxe foncière (ou la taxe kilométrique). La taxe est perçue par l'autorité compétente de la juridiction dans laquelle se trouve la propriété ; elle peut être versée à un gouvernement national, à un État fédéré, à un comté ou une région géographique, ou encore à une municipalité. Plusieurs juridictions peuvent imposer la même propriété. Dans la plupart des provinces canadiennes, les acheteurs de maison doivent payer une taxe unique appelée taxe sur les transferts de propriété (taxes de cession immobilière) qui est basée sur le coût de la maison. + +## Statistiques internationales + +Le taux d'accession à la propriété est le rapport entre les unités occupées par leur propriétaire et le nombre total d'unités résidentielles dans une zone spécifiée.*[meilleure source nécessaire*] + + +*The median age of US homebuyers has increased in recent decades, for both first-time buyers (+9 years since 1981) and repeat buyers (+25 years), and all buyers overall (+26 years).[[8]](#cite_note-NARre2024-8)* + + +*Percentage of owner-occupied units in urban areas, by country* + +| Country | % Owner-Occupied Units in Urban Areas*[[9]](#cite_note-:0-9)* | Urban Population, % of Total*[[9]](#cite_note-:0-9)* | Home ownership rate | | +| --- | --- | --- | --- | --- | +| | | | % | Year | +| Albania | — | — | 95.3 | 2023 | +| Argentina | 67% | 92% | 68.9 | 2017 | +| Armenia | 96% | 64% | — | — | +| Australia | 68% | 89% | 66.3 | 2020 | +| Austria | — | — | 54.3 | 2023 | +| Azerbaijan | 71% | 52% | — | — | +| Belgium | — | — | 71.9 | 2023 | +| Bosnia and Herzegovina | — | — | 91.2 | 2007 | +| Brazil | 74% | 87% | 70.8 | 2022 | +| Brunei | — | — | 65.0 | 2019 | +| Bulgaria | 87% | 73% | 86.1 | 2023 | +| Canada | 68% | 81% | 66.5 | 2021 | +| Chile | 69% | 89% | — | — | +| China | 89% | 45% | 96.0 | 2022 | +| Colombia | 50% | 75% | — | — | +| Costa Rica | 75% | 94% | — | — | +| Croatia | — | — | 91.2 | 2023 | +| Cuba | — | — | 90.0 | 2014 | +| Cyprus | — | — | 68.8 | 2023 | +| Czech Republic | 47% | 74% | 76.0 | 2023 | +| Denmark | 54% | 87% | 60.0 | 2023 | +| East Timor | — | — | 49.9 | 2007 | +| Egypt | 37% | 43% | 76.0 | 2019 | +| Estonia | — | — | 80.7 | 2023 | +| European Union | — | — | 69.2 | 2023 | +| Finland | — | — | 69.2 | 2023 | +| France | 47% | 78% | 63.1 | 2023 | +| Germany | 43% | 74% | 47.6 | 2023 | +| Greece | — | — | 69.6 | 2023 | +| Haiti | 60% | 48% | — | — | +| Hong Kong | 53% | 100% | 50.4 | 2023 | +| Hungary | 93% | 68% | 90.5 | 2023 | +| Iceland | — | — | 75.0 | 2021 | +| India | 87% | 30% | 86.6 | 2011 | +| Indonesia | 67% | 54% | 84.0 | 2019 | +| Iran | — | — | 60.5 | 2017 | +| Ireland | — | — | 69.4 | 2023 | +| Israel | — | — | 64.6 | 2019 | +| Italy | 80% | 68% | 75.9 | 2024 | +| Japan | — | — | 55.0 | 2021 | +| Kazakhstan | 96% | — | 98.0 | 2024 | +| Kenya | — | 22% | 75.0 | 2019 | +| Laos | — | — | 95.9 | 2015 | +| Latvia | — | — | 82.8 | 2023 | +| Lithuania | — | — | 88.8 | 2023 | +| Luxembourg | — | — | 67.6 | 2023 | +| Malawi | — | 19% | — | — | +| Malaysia | — | 72% | 76.9 | 2019 | +| Malta | — | — | 74.7 | 2023 | +| Mexico | 71% | 78% | 80.0 | 2009 | +| Mongolia | — | 58% | — | — | +| Montenegro | — | — | 91.0 | 2023 | +| Morocco | 62% | 57% | — | — | +| Myanmar | — | — | 85.5 | 2014 | +| Namibia | 69% | 35% | — | — | +| Nepal | — | — | 86.0 | 2021 | +| Netherlands | 59% | 83% | 70.2 | 2023 | +| New Zealand | 67% | 87% | 64.5 | 2018 | +| Nigeria | 10% | 50% | 25.0 | 2019 | +| North Macedonia | — | — | 85.8 | 2023 | +| Norway | 77% | 78% | 79.2 | 2023 | +| Oman | — | — | 83.0 | 2014 | +| Pakistan | — | 38% | 82.0 | 2023 | +| Panama | 66% | 75% | — | — | +| Peru | — | 72% | — | — | +| Philippines | 80% | 66% | — | — | +| Poland | 78% | 61% | 87.3 | 2023 | +| Portugal | — | — | 76.0 | 2023 | +| Romania | 97% | 54% | 95.6 | 2023 | +| Russia | 81% | 73% | 92.6 | 2023 | +| Rwanda | — | 19% | — | — | +| Saudi Arabia | — | — | 62.1 | 2019 | +| Senegal | — | 43% | — | — | +| Serbia | — | — | 91.6 | 2023 | +| Singapore | 87% | 100% | 87.9 | 2020 | +| Slovakia | — | — | 93.6 | 2023 | +| Slovenia | — | — | 75.2 | 2023 | +| South Africa | 62% | 62% | 69.7 | 2021 | +| South Korea | 56% | 82% | 57.3 | 2021 | +| Spain | 85% | 77% | 75.3 | 2023 | +| Sri Lanka | 82% | 15% | — | — | +| Sweden | 41% | 85% | 64.9 | 2023 | +| Switzerland | 40% | 74% | 42.3 | 2023 | +| Taiwan | — | — | 83.9 | 2010 | +| Tanzania | — | 26% | — | — | +| Thailand | 75% | 34% | 74.0 | 2021 | +| Trinidad and Tobago | — | — | 76.0 | 2013 | +| Tunisia | 78% | 67% | — | — | +| Turkey | 81% | 70% | 56.7 | 2023 | +| Uganda | — | 13% | — | — | +| Ukraine | — | 68% | — | — | +| United Arab Emirates | — | — | 28.0 | 2017 | +| United Kingdom | 50% | 90% | 65.2 | 2023 | +| United States | 65% | 82% | 65.7 | 2024 | +| Uruguay | 59% | 93% | — | — | +| Venezuela | 83% | 94% | — | — | +| Vietnam | — | 28% | 90.0 | 2020 | +| Zimbabwe | — | 38% | — | — | + +## Voir aussi + +* Accession à la propriété en Australie + +* L'accession à la propriété en Allemagne + +* L'accession à la propriété aux États-Unis + +* Loyer imputé + +* Fonds propres négatifs + +* Propriété + +* Propriété + diff --git a/fastapi/app/ai/synthesis.py b/fastapi/app/ai/synthesis.py index 0e4db01..a35d2c9 100644 --- a/fastapi/app/ai/synthesis.py +++ b/fastapi/app/ai/synthesis.py @@ -14,10 +14,11 @@ class TextFragment: text: str missing_info: List[str] = field(default_factory=list) + translation: str = field(default_factory=str) @dataclass class ArticleModel: - titles: list[str] = field(default_factory=list) + titles: list[TextFragment] = field(default_factory=list) text: list[TextFragment] = field(default_factory=list) media: list[str] = field(default_factory=list) tabular: list[str] = field(default_factory=list) @@ -25,10 +26,24 @@ class ArticleModel: structure: List[Tuple[int, int]] = field(default_factory=list) + def get_section(self, _type: int, index: int): + if _type == TITLE: + return self.titles[index] + elif _type == TEXT: + return self.text[index] + elif _type == MEDIA: + return self.media[index] + elif _type == TABULAR: + return self.tabular[index] + elif _type == REFERENCES: + return self.references[index] + else: + return None + def add_to_section(self, section_type: int, section_content: str): idx = 0 if section_type == 0: - self.titles.append(section_content) + self.titles.append(TextFragment(text=section_content)) idx = len(self.titles)-1 elif section_type == 1: self.text.append(TextFragment(text=section_content)) @@ -44,6 +59,9 @@ def add_to_section(self, section_type: int, section_content: str): idx = len(self.references)-1 self.structure.append((section_type, idx)) + # def synthesize(self): + # pass + def html_to_md(page_name): params = { "action": "parse", @@ -54,13 +72,26 @@ def html_to_md(page_name): "redirects": 1 } - r = requests.get("https://en.wikipedia.org/w/api.php", params=params, headers={"User-Agent": "YourAppName/1.0 (contact@example.com)"}, timeout=30) - html = r.json()["parse"]["text"] - markdown = convert_to_markdown(html) - - return markdown - -def create_article_model(md_content: str) -> ArticleModel: + try: + r = requests.get("https://en.wikipedia.org/w/api.php", params=params, headers={"User-Agent": "YourAppName/1.0 (contact@example.com)"}, timeout=30) + r.raise_for_status() # Raise an exception for bad status codes + + json_data = r.json() + if "parse" not in json_data: + raise ValueError(f"No 'parse' key in API response for page '{page_name}'. Response: {json_data}") + + html = json_data["parse"]["text"] + markdown = convert_to_markdown(html) + return markdown + + except requests.exceptions.RequestException as e: + raise Exception(f"Failed to fetch Wikipedia page '{page_name}': {e}") + except ValueError as e: + raise Exception(f"Invalid response from Wikipedia API for page '{page_name}': {e}") + except KeyError as e: + raise Exception(f"Missing expected key in Wikipedia API response for page '{page_name}': {e}") + +def create_article_model_from_md(md_content: str) -> ArticleModel: def html_image_to_markdown(html_img): src_match = re.search(r'src=["\']([^"\']+)["\']', html_img) alt_match = re.search(r'alt=["\']([^"\']*)["\']', html_img) @@ -178,15 +209,16 @@ def parse_image(start_idx): model.add_to_section(TEXT, article_lines[line_idx]) line_idx += 1 - for x in model.tabular: - print("-"*50) - print(x) + # for x in model.tabular: + # print("-"*50) + # print(x) return model +def create_article_model(article_title): + md = html_to_md(article_title) + return create_article_model_from_md(md) + + -# article_titles = ["Pet door", "Owner-occupancy"] -# -# md = html_to_md(article_titles[1]) -# model = create_article_model(md) diff --git a/fastapi/app/ai/synthesis_demo.py b/fastapi/app/ai/synthesis_demo.py new file mode 100644 index 0000000..34c05df --- /dev/null +++ b/fastapi/app/ai/synthesis_demo.py @@ -0,0 +1,41 @@ +from synthesis import * +from deep_translator import GoogleTranslator +from tqdm import tqdm + +test_article_titles = ["Pet door", "Owner-occupancy"] + +model = create_article_model(test_article_titles[1]) + +for fragment in tqdm(model.text): + translated = GoogleTranslator( + source="en", + target="fr" + ).translate(fragment.text) + fragment.translation = translated + +for heading in tqdm(model.titles): + translated = GoogleTranslator( + source="en", + target="fr" + ).translate(heading.text) + heading.translation = translated + + +target_file_name = "SYNTH_DEMO.md" +synthesized_md_text = "" + +for section_type, idx in model.structure: + if section_type == TITLE: + synthesized_md_text += f"## {model.titles[idx].translation}\n\n" + elif section_type == TEXT: + synthesized_md_text += f"{model.text[idx].translation}\n\n" + elif section_type == MEDIA: + synthesized_md_text += model.media[idx] + "\n" + elif section_type == TABULAR: + synthesized_md_text += model.tabular[idx] + "\n" + elif section_type == REFERENCES: + synthesized_md_text += model.references[idx] + "\n" + +with open(target_file_name, 'w') as file: + file.write(synthesized_md_text) +print("done.") diff --git a/fastapi/app/main.py b/fastapi/app/main.py index da9c342..3664e77 100644 --- a/fastapi/app/main.py +++ b/fastapi/app/main.py @@ -209,39 +209,42 @@ def compare_articles(text_a: str, text_b: str, similarity_threshold: float = 0.7 return output @app.get("/synthesis/full", response_model=ArticleResponse) -def synthesize_full_article(target_language: str, article_a: str, article_b: str, article_synth_base: int): +def synthesize_full_article(target_language: str, article_title_a: str, article_title_b: str, article_synth_base: int): + # article_synth_base indicates which article model will be used as the base for the newly synthesized article + if article_synth_base < 0 or article_synth_base >= 2: raise HTTPException(status_code=400, detail="article_synth_base must be 0 or 1") # todo: check if language target is supported - deconstruct_a = deconstruct_article(article_a) - deconstruct_b = deconstruct_article(article_b) + model_a = create_article_model(article_title_a) + model_b = create_article_model(article_title_b) - target_base = deconstruct_a if article_synth_base == 0 else deconstruct_b - comp_base = deconstruct_b if article_synth_base == 0 else deconstruct_a + target_base = model_a if article_synth_base == 0 else model_b + comp_base = model_b if article_synth_base == 0 else model_a missing = {} extra = {} - synthesis = [] - # todo: this full comparison code should be in the sem. comparison (underlying function) endpoint also - for id, text_a in enumerate(target_base.text): - for text_b in comp_base.text: # use multiprocessing - output = llm_semantic_comparison(text_a, text_b) - missing[id] = output['missing_info'] - extra[id] = output['extra_info'] - - for id, text in enumerate(target_base.text): - # synthesize paragraph - para_synth = synthesize_paragraph(text, missing[id]) - # add to full synthesis - synthesis.append(para_synth) - - synthesis.extend(target_base.media) - synthesis.extend(target_base.tabular) - - return synthesis + for fragment_a in enumerate(target_base.text): + for fragment_b in comp_base.text: + output = llm_semantic_comparison(fragment_a.text, fragment_b.text) + fragment_a.missing_info = output['missing_info'] + # missing[id] = output['missing_info'] + # extra[id] = output['extra_info'] + + symmetrical_article_text = "" + for pack in target_base.structure: + section_type, index = pack + + section = target_base.get_section(section_type, index) + if section_type == TEXT: + + # full_text = section.text + + else: + + + # return synthesis if __name__ == '__main__': # Defines API URL (host, port) diff --git a/fastapi/app/test/__init__.py b/fastapi/app/tests/__init__.py similarity index 100% rename from fastapi/app/test/__init__.py rename to fastapi/app/tests/__init__.py diff --git a/fastapi/app/test/test_main.py b/fastapi/app/tests/test_main.py similarity index 100% rename from fastapi/app/test/test_main.py rename to fastapi/app/tests/test_main.py