# experiments/check_intersection_of_query_and_para_necessary.py

"""Compare the parameter intersections between the `all_shortcuts_paras_that_is_necessary_in_query.json` generated by `generate_shortcut_desc.py` 
and the `generated_success_queries.json` produced by `generate_queries.py`.

This comparison is necessary because user queries, which are designed to be natural, might not include all the parameters essential for constructing shortcuts.
An LLM is therefore used to identify these essential parameters and ensure they are correctly included in our evaluation.
"""

import os
import json
import openai
import tiktoken
import time
from generate_shortcut_desc import WFActionsClass, APIsClass
from generate_shortcut_desc import get_all_shortcuts_paras_that_is_necessary_in_query
from cal_shortcut_len import cal_WFWorkflowActions_len

# OpenAI client, constructed with no explicit arguments.
client = openai.OpenAI()

# Running totals of prompt / completion tokens reported by the API.
input_token_count = 0
output_token_count = 0

# Chat model to query, and its prices per one million input / output tokens.
model_name = "gpt-3.5-turbo"
input_price_every_million = 0.5
output_price_every_million = 1.5
# Alternative configuration:
# model_name = "gpt-4o"
# input_price_every_million, output_price_every_million = 5, 15

# System prompt sent unchanged with every classification request. It describes
# the inputs (user query, API definition, API call) and defines two categories:
# "Precise parameter" and "Not precise parameter".
# NOTE(review): USER_PROMPT_TEMPLATE asks the model to answer with the labels
# 'Essential parameter' / 'Not essential parameter' instead -- confirm which
# vocabulary is intended before changing either prompt.
SYSTEM_PROMPT_TEMPLATE = """Your task is to classify the parameters I provide based on user queries, API information, and API calls (also known as actions).

User query describes the task the user wants to accomplish.

Information about the API definition includes the API name, parameter names, parameter types, default values, return value names, and return value types. Parameters are identified by 'Parameters' and explained. The return value names and return value types are identified by 'Return Values'. The API's brief and detailed descriptions are marked by 'Description'. The natural language description of the API is marked by 'ParameterSummary'.

Completing the user query requires a series of API calls, each API call needs the correct and appropriate parameters. 
We have pre-selected possible parameters that may appear in the query.

Please note, you must classify these pre-selected parameters based on the user query. Each parameter can generally be classified into the following categories:
1. Precise parameter: Parameters stated by users in the query, or those implicitly indicated in the query but can be accurately inferred by combining the query and the API definition.
2. Not precise parameter: Parameters not stated by users in the query and cannot be accurately inferred even with the combination of the query and the API definition.

Note! Note! Note! all precise parameters must be clearly or implicitly specified in the query.
"""


# Per-action user prompt. Filled in with `str.format` using the keyword
# arguments `query`, `api_desc`, `API_call` and `possible_paras`; the doubled
# braces are literal braces of the JSON-like output example.
# The second example entry now refers to `para_name2` (it previously repeated
# `para_name1`, a copy-paste slip in the format description).
# NOTE(review): the labels requested here ('Essential parameter' /
# 'Not essential parameter') differ from the categories named in
# SYSTEM_PROMPT_TEMPLATE ('Precise parameter' / 'Not precise parameter').
# They are left untouched because consumers of the saved output may match on
# them -- confirm which vocabulary is intended.
USER_PROMPT_TEMPLATE = """The user query is： {query}

Information about the API definition is provided below:
{api_desc}

The API call is:
{API_call}
The pre-selected possible parameters that may appear in the query are listed below:
{possible_paras}

Output the classification in the following format:
{{
    para_name1: {{
        para_name1: para_type1 # para_type must be in ['Essential parameter', 'Not essential parameter'],
        "reason1": The reason you classify para_name1 into this category,
    }},
    para_name2: {{
        para_name2: para_type2,
        "reason2": The reason you classify para_name2 into this category
    }},
    ...
}}

Do not output any additional content; only output a JSON. Do not enclose your output with ```json XXX```.

Note! Note! Note! all precise parameters must be clearly or implicitly specified in the query.

Begin!
"""

# Dry-run switch: when True the main loop only counts prompt tokens and
# gathers statistics, and never calls the API.
pre_compute_cost = False

# Dry-run statistics: prompt tokens, candidate parameters, shortcuts and
# actions that would be sent.
input_pre_cal_count = input_para_sum = input_shortcut_num = input_action_num = 0

# Token totals accumulated from the API responses during a real run.
input_token_count, output_token_count = 0, 0

if __name__ == "__main__":

    # Root directory for every input and output file of this script.
    SHORTCUT_DATA = os.getenv("SHORTCUT_DATA")
    if SHORTCUT_DATA is None:
        # Fail with an explicit message; otherwise the first os.path.join below
        # raises an opaque TypeError about NoneType.
        raise SystemExit("Environment variable SHORTCUT_DATA is not set; it must point to the data directory.")

    final_detailed_records_path = os.path.join(SHORTCUT_DATA, "1_final_detailed_records_filter_apis_leq_30.json")
    with open(final_detailed_records_path, "r") as rp:
        final_detailed_records = json.load(rp)
    # Optional (disabled): sort the shortcuts by length.
    # final_detailed_records.sort(key=lambda x: cal_WFWorkflowActions_len(x["shortcut"]["WFWorkflowActions"], URL=x["URL"]), reverse=True)

    # Re-key the loaded list of records by their "URL" field so that records
    # can be looked up by URL later. If two records share a URL, the later one wins.
    final_detailed_records = {record["URL"]: record for record in final_detailed_records}

    """Definition file from the Shortcuts App"""
    WFActions_path = os.path.join(SHORTCUT_DATA, "is.workflow.actions", "WorkflowKit.framework/Versions/A/Resources/WFActions.json")
    wf_actions_instance = WFActionsClass(WFActions_path)
    # Merge the extra definitions from my_WFActions.json into the loaded ones.
    # The file is read inside a `with` block so its handle is closed as soon as
    # parsing finishes instead of being left to the garbage collector.
    my_WFActions_path = os.path.join(SHORTCUT_DATA, "is.workflow.actions", "my_WFActions.json")
    with open(my_WFActions_path, "r") as my_wf_actions_file:
        wf_actions_instance.WFActions_dicts.update(json.load(my_wf_actions_file))
    all_api2info_WF, all_api2paraname2paratype_WF, _ = wf_actions_instance.all_api2desc(need_api2paraname2paratype=True, need_api2parasummary=True)

    # Load the definition file from the App.
    # fail_api_json_path = os.path.join(SHORTCUT_DATA, "4_fail_api_json_filter.json")
    # succ_api_json_path = os.path.join(SHORTCUT_DATA, "4_success_api_json_filter.json")
    # API_instance = APIsClass(succ_api_json_path, fail_api_json_path)
    api_json_path = os.path.join(SHORTCUT_DATA, "4_api_json_filter.json")
    API_instance = APIsClass(api_json_path)
    all_api2info_from_app, all_api2paraname2paratype_from_app, _ = API_instance.all_api2desc(need_api2paraname2paratype=True, need_api2parasummary=True)

    # Mapping of all APIs to API info. Entries from the app definitions
    # override same-named entries from the WFActions definitions.
    all_api2info = all_api2info_WF.copy()
    all_api2info.update(all_api2info_from_app)
    # print(len(all_api2info_WF), len(all_api2info_from_app), len(all_api2info))

    # Mapping of all APIs to parameter names to their types (including default values).
    all_api2paraname2paratype = all_api2paraname2paratype_WF.copy()
    all_api2paraname2paratype.update(all_api2paraname2paratype_from_app)

    # Mapping of shortcuts to action positions and parameter names.
    all_shortcuts_paras_that_is_necessary_in_query = get_all_shortcuts_paras_that_is_necessary_in_query(
        list(final_detailed_records.values()), all_api2paraname2paratype)

    # Queries produced earlier, keyed by shortcut URL.
    generated_success_queries_path = os.path.join(SHORTCUT_DATA, "generated_success_queries.json")
    with open(generated_success_queries_path, "r") as f:
        generated_success_queries = json.load(f)
    print(f"Number of queries generated：{len(generated_success_queries)}")

    # Resume support: collect the URLs already present in this model's results
    # file (one JSON document per line) so the main loop can skip them.
    already_significant_shortcuts_path = os.path.join(SHORTCUT_DATA, f"{model_name}_check_intersection_of_query_and_para_necessary.json")
    already_significant_url_names = set()
    if os.path.exists(already_significant_shortcuts_path):
        with open(already_significant_shortcuts_path, "r") as f:
            already_significant_url_names = {json.loads(line)["URL"] for line in f}
    to_be_processed_num = len(generated_success_queries) - len(already_significant_url_names)
    print(f"Number of processed shortcuts: {len(already_significant_url_names)}，Number of remaining shortcuts: {to_be_processed_num}")

    # Records produced by this run that have not been written to disk yet.
    new_significant_shortcuts = []

    def num_tokens_from_string(string: str, encoding_name: str) -> int:
        """Return the number of tokens `string` encodes to under the tiktoken encoding `encoding_name`."""
        tokens = tiktoken.get_encoding(encoding_name).encode(string)
        return len(tokens)

    # Randomly select 200 entries from `generated_success_queries` and create a new dictionary named `random_200_success_queries`.
    # random_200_success_queries = {}
    # cnt = 0
    # for URL, cur_generated_query in generated_success_queries.items():
    #     if URL in already_significant_url_names:
    #         continue
    #     random_200_success_queries[URL] = cur_generated_query
    #     cnt += 1
    #     if cnt == 10:
    #         break

    # Main loop: for every not-yet-processed shortcut, ask the model to classify
    # the pre-selected parameters of each action, and checkpoint the collected
    # records to the JSONL results file every 20 shortcuts. When
    # `pre_compute_cost` is True no request is sent; only prompt tokens and
    # statistics are counted.
    def _append_jsonl(path, records):
        """Append each item of `records` to the file at `path` as one JSON line."""
        with open(path, "a") as f:
            f.write("".join(json.dumps(record) + "\n" for record in records))

    # Results file used by both the periodic checkpoint and the save-before-abort path.
    # NOTE: writing a cost summary (cost_<model>_...json) is currently disabled.
    res_path = os.path.join(
        SHORTCUT_DATA,
        f"{model_name}_check_intersection_of_query_and_para_necessary.json")

    print("Begin processing new shortcuts...")
    cnt = 0
    input_cost, output_cost, total_cost = 0, 0, 0
    try_times = 5  # maximum number of API attempts per action
    # for URL, cur_generated_query in random_200_success_queries.items():
    for URL, cur_generated_query in generated_success_queries.items():

        if URL in already_significant_url_names:
            continue

        query = cur_generated_query["GeneratedQuery"]["query"]

        if URL not in all_shortcuts_paras_that_is_necessary_in_query:
            continue

        cur_necessary_paras = all_shortcuts_paras_that_is_necessary_in_query[URL]
        WFWorkflowActions = final_detailed_records[URL]["shortcut"]["WFWorkflowActions"]

        significant_paras = {}  # Model output per action position for the current shortcut.
        # Reset per shortcut. Without this, a shortcut with no necessary
        # parameters would either hit a NameError (first shortcut) or silently
        # record the API description left over from the previous shortcut.
        cur_api2info = None
        if_have_necessary_para = False

        # One iteration per action that has pre-selected parameters (original
        # author's note: `is.workflow.actions.comment` and `alert` are excluded upstream).
        for cur_pos, cur_necessary_para_dict in cur_necessary_paras.items():

            cur_int_pos = int(cur_pos)
            WFWorkflowActionIdentifier = WFWorkflowActions[cur_int_pos]["WFWorkflowActionIdentifier"]
            cur_api2info = all_api2info[WFWorkflowActionIdentifier]
            cur_necessary_names = list(cur_necessary_para_dict.keys())

            system_prompt = SYSTEM_PROMPT_TEMPLATE
            user_prompt = USER_PROMPT_TEMPLATE.format(
                query=query,
                API_call=cur_necessary_para_dict,
                api_desc=cur_api2info,
                possible_paras=cur_necessary_names,
            )

            if pre_compute_cost:
                # Dry run: count the prompt tokens locally and skip the request.
                # The count is only needed here, so it is not computed in real runs.
                input_pre_cal_count += num_tokens_from_string(system_prompt + user_prompt, "cl100k_base")
                cnt += 1
                if cnt % 20 == 0:
                    print(f"Processed {cnt} results.")
                input_para_sum += len(cur_necessary_names)
                input_action_num += 1
                if_have_necessary_para = True
                continue

            for attempt in range(1, try_times + 1):
                try:
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_prompt},
                        ],
                    )
                    input_token_count += completion.usage.prompt_tokens
                    output_token_count += completion.usage.completion_tokens
                    input_cost = input_token_count / 1000000 * input_price_every_million
                    output_cost = output_token_count / 1000000 * output_price_every_million
                    total_cost = input_cost + output_cost
                    break
                except Exception as e:
                    print(f"Fail! Generation Error! generating shortcut {URL} on pos {cur_int_pos}, {WFWorkflowActionIdentifier}")
                    print(e)
                    if attempt == try_times:
                        # Out of attempts: persist what has been collected so
                        # far, then abort the run with the original exception.
                        print(f"Processed {cnt} results.")
                        _append_jsonl(res_path, new_significant_shortcuts)
                        raise

            try:
                generated_content = json.loads(completion.choices[0].message.content)
            except Exception as e:
                # The reply was not valid JSON: skip this action, keep going.
                print(f"Fail! Generation Error! generating shortcut {URL} on pos {cur_int_pos}, {WFWorkflowActionIdentifier}")
                print(e)
                continue

            significant_paras[str(cur_int_pos)] = generated_content
            print(f"Token count: {input_token_count}, {output_token_count}, Input Cost {input_cost}, Output Cost {output_cost}, Total Cost {total_cost}")

            time.sleep(0.5)

        if pre_compute_cost:
            if if_have_necessary_para:
                input_shortcut_num += 1
            continue

        # "api_desc" holds the description of the last action visited above
        # (None when the shortcut had no actions to classify).
        new_significant_shortcuts.append(
            {
                "URL": URL,
                "query": query,
                "api_desc": cur_api2info,
                "necessary_paras": cur_necessary_paras,
                "significant_paras": significant_paras,
            })

        cnt += 1
        print(f"Processed {cnt} results.")
        if cnt % 20 == 0:
            # Checkpoint: flush the pending records and start a fresh batch.
            _append_jsonl(res_path, new_significant_shortcuts)
            new_significant_shortcuts = []
            print(f"Saved {cnt} results.")
    
    if pre_compute_cost:
        # Dry run: report the locally counted prompt tokens and statistics.
        # Pre-computed token cnt: 22095963, cost 11.0479815
        print(f"Pre-computed token cnt: {input_pre_cal_count}, cost {input_pre_cal_count / 1000000 * input_price_every_million}, number of paras {input_para_sum}, input_action_num {input_action_num}, input_shortcut_num {input_shortcut_num}")

    else:
        # Real run: flush the records collected since the last checkpoint.
        cost_path = os.path.join(
            SHORTCUT_DATA,
            f"cost_{model_name}_check_intersection_of_query_and_para_necessary.json")

        # Writing the cost summary is currently disabled:
        # with open(cost_path, "w") as f:
        #     json.dump({
        #         "input_token_count": input_token_count, 
        #         "output_token_count": output_token_count, 
        #         "input_cost": input_cost, 
        #         "output_cost": output_cost, 
        #         "total_cost": total_cost
        #         }, f, indent=4, ensure_ascii=False)

        res_path = os.path.join(
            SHORTCUT_DATA,
            f"{model_name}_check_intersection_of_query_and_para_necessary.json")

        # Opened in append mode even when nothing is pending, so the results
        # file exists afterwards.
        with open(res_path, "a") as f:
            f.write("".join(json.dumps(res) + "\n" for res in new_significant_shortcuts))

    def jsonl_to_json(jsonl_file_path, json_file_path):
        """Convert a JSON-lines file into one JSON object keyed by each record's URL.

        Every non-blank line of `jsonl_file_path` is parsed as a JSON object.
        Its 'URL' entry is removed and used as the key under which the rest of
        the record is stored. If a URL occurs more than once, the last record
        wins. The resulting mapping is written to `json_file_path` as indented
        UTF-8 JSON.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a record has no 'URL' entry.
        """
        result_dict = {}

        with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                # Blank or whitespace-only lines (e.g. a stray trailing newline)
                # carry no record and would make json.loads fail.
                if not line.strip():
                    continue
                record = json.loads(line)
                url = record.pop('URL')
                result_dict[url] = record

        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(result_dict, json_file, ensure_ascii=False, indent=4)

    # Convert the accumulated JSON-lines results into a single URL-keyed JSON file.
    src_path = os.path.join(SHORTCUT_DATA, f"{model_name}_check_intersection_of_query_and_para_necessary.json")
    dst_path = os.path.join(SHORTCUT_DATA, f"json-{model_name}_check_intersection_of_query_and_para_necessary.json")
    if os.path.exists(src_path):
        jsonl_to_json(src_path, dst_path)
    else:
        # No results file exists yet (for example a first run with
        # pre_compute_cost enabled, which writes no results), so there is
        # nothing to convert.
        print(f"No results file found at {src_path}; skipping JSON conversion.")