-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcheck_intersection_of_query_and_para_necessary.py
383 lines (308 loc) · 18 KB
/
check_intersection_of_query_and_para_necessary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
"""Compare the parameter intersections between the `all_shortcuts_paras_that_is_necessary_in_query.json` generated by `generate_shortcut_desc.py`
and the `generated_success_queries.json` produced by `generate_queries.py`.
This comparison is necessary because user queries, which are written to sound natural, may not mention every parameter that is essential for constructing the shortcut.
An LLM is used to identify these essential parameters and to ensure they are correctly included in the evaluation.
"""
import os
import json
import openai
import tiktoken
import time
from generate_shortcut_desc import WFActionsClass, APIsClass
from generate_shortcut_desc import get_all_shortcuts_paras_that_is_necessary_in_query
from cal_shortcut_len import cal_WFWorkflowActions_len
# OpenAI client used for every chat-completion request issued by this script.
# NOTE(review): presumably reads its credentials from the environment — confirm.
client = openai.OpenAI()
# Running totals of prompt / completion tokens, accumulated from the `usage` field of each response.
input_token_count, output_token_count = 0, 0
# Model used for the classification requests; the name is also embedded in the result file names.
model_name = "gpt-3.5-turbo"
# Price per one million input / output tokens; only used to compute the cost figures printed during the run.
# NOTE(review): the currency and whether these prices are current cannot be told from this file — confirm.
input_price_every_million, output_price_every_million = 0.5, 1.5
# Alternative model configuration (disabled):
# model_name = "gpt-4o"
# input_price_every_million, output_price_every_million = 5, 15
# System message sent with every classification request. It is used verbatim: no placeholders are filled in.
# NOTE(review): this prompt names the two categories "Precise parameter" / "Not precise parameter", whereas
# USER_PROMPT_TEMPLATE asks the model to answer with 'Essential parameter' / 'Not essential parameter'.
# The text is deliberately left unchanged because any edit alters what is sent to the model — confirm which
# label set is intended before harmonising the two prompts.
SYSTEM_PROMPT_TEMPLATE = """Your task is to classify the parameters I provide based on user queries, API information, and API calls (also known as actions).
User query describes the task the user wants to accomplish.
Information about the API definition includes the API name, parameter names, parameter types, default values, return value names, and return value types. Parameters are identified by 'Parameters' and explained. The return value names and return value types are identified by 'Return Values'. The API's brief and detailed descriptions are marked by 'Description'. The natural language description of the API is marked by 'ParameterSummary'.
Completing the user query requires a series of API calls, each API call needs the correct and appropriate parameters.
We have pre-selected possible parameters that may appear in the query.
Please note, you must classify these pre-selected parameters based on the user query. Each parameter can generally be classified into the following categories:
1. Precise parameter: Parameters stated by users in the query, or those implicitly indicated in the query but can be accurately inferred by combining the query and the API definition.
2. Not precise parameter: Parameters not stated by users in the query and cannot be accurately inferred even with the combination of the query and the API definition.
Note! Note! Note! all precise parameters must be clearly or implicitly specified in the query.
"""
# User message template, filled in with str.format using the keys `query`, `api_desc`, `API_call` and
# `possible_paras`. Doubled braces ({{ and }}) are emitted as literal braces by str.format.
# NOTE(review): the "reason2" line refers to para_name1 (probably meant para_name2), and the label set
# requested here ('Essential parameter' / 'Not essential parameter') differs from the categories named in
# SYSTEM_PROMPT_TEMPLATE ("Precise" / "Not precise"). The text is left unchanged because editing it changes
# what is sent to the model — confirm the intended wording first.
USER_PROMPT_TEMPLATE = """The user query is: {query}
Information about the API definition is provided below:
{api_desc}
The API call is:
{API_call}
The pre-selected possible parameters that may appear in the query are listed below:
{possible_paras}
Output the classification in the following format:
{{
para_name1: {{
para_name1: para_type1 # para_type must be in ['Essential parameter', 'Not essential parameter'],
"reason1": The reason you classify para_name1 into this category,
}},
para_name2: {{
para_name2: para_type2,
"reason2": The reason you classify para_name1 into this category
}},
...
}}
Do not output any additional content; only output a JSON. Do not enclose your output with ```json XXX```.
Note! Note! Note! all precise parameters must be clearly or implicitly specified in the query.
Begin!
"""
# Dry-run switch: when True, the processing loop only accumulates the statistics below (prompt tokens counted
# locally with tiktoken, parameter/action/shortcut counts) and skips the API request for each action.
pre_compute_cost = False
# Sum of locally counted prompt tokens (system + user prompt) over all actions in dry-run mode.
input_pre_cal_count = 0
# Total number of pre-selected parameter names seen in dry-run mode.
input_para_sum = 0
# Number of shortcuts that had at least one action with pre-selected parameters (dry-run mode).
input_shortcut_num = 0
# Number of actions visited in dry-run mode.
input_action_num = 0
# Token totals reported by the API. These two names are already initialised to 0 near the top of the file,
# so this second initialisation is redundant.
input_token_count = 0
output_token_count = 0
if __name__ == "__main__":
# Root data directory, taken from the SHORTCUT_DATA environment variable.
# NOTE(review): os.getenv returns None when the variable is unset, in which case the os.path.join below raises TypeError.
SHORTCUT_DATA = os.getenv("SHORTCUT_DATA")
final_detailed_records_path = os.path.join(SHORTCUT_DATA, "1_final_detailed_records_filter_apis_leq_30.json")
# Load the detailed shortcut records; the loop below iterates them as a list of dicts that each carry a "URL" key.
with open(final_detailed_records_path, "r") as rp:
final_detailed_records = json.load(rp)
# (Disabled) Sort the shortcuts by length. Note that reverse=True would sort in descending, not ascending, order.
# final_detailed_records.sort(key=lambda x: cal_WFWorkflowActions_len(x["shortcut"]["WFWorkflowActions"], URL=x["URL"]), reverse=True)
# Re-index the records by URL so they can be looked up directly while iterating the generated queries.
# If two records share a URL, the later one replaces the earlier one.
new_final_detailed_records = {}
for cur_detailed_record in final_detailed_records:
URL = cur_detailed_record["URL"]
new_final_detailed_records[URL] = cur_detailed_record
final_detailed_records = new_final_detailed_records
# Drop the temporary name; the dict itself stays alive through final_detailed_records.
del new_final_detailed_records
"""Definition file from the Shortcuts App"""
# Action definitions shipped with the Shortcuts app (WFActions.json), wrapped by the project helper WFActionsClass.
WFActions_path = os.path.join(SHORTCUT_DATA, "is.workflow.actions", "WorkflowKit.framework/Versions/A/Resources/WFActions.json")
wf_actions_instance = WFActionsClass(WFActions_path)
# Merge the project's own additional action definitions into the loaded ones (same keys are overwritten).
my_WFActions_path = os.path.join(SHORTCUT_DATA, "is.workflow.actions", "my_WFActions.json")
# NOTE(review): the file object created by open() here is never closed explicitly.
wf_actions_instance.WFActions_dicts.update(json.load(open(my_WFActions_path, "r")))
# all_api2desc returns three values; the third one is not needed here and is discarded.
all_api2info_WF, all_api2paraname2paratype_WF, _ = wf_actions_instance.all_api2desc(need_api2paraname2paratype=True, need_api2parasummary=True)
"""Load the definition file from the App."""
# Earlier variant that used separate success / fail API files (disabled):
# fail_api_json_path = os.path.join(SHORTCUT_DATA, "4_fail_api_json_filter.json")
# succ_api_json_path = os.path.join(SHORTCUT_DATA, "4_success_api_json_filter.json")
# API_instance = APIsClass(succ_api_json_path, fail_api_json_path)
api_json_path = os.path.join(SHORTCUT_DATA, "4_api_json_filter.json")
API_instance = APIsClass(api_json_path)
all_api2info_from_app, all_api2paraname2paratype_from_app, _ = API_instance.all_api2desc(need_api2paraname2paratype=True, need_api2parasummary=True)
"""Mapping of all APIs to API info"""
# Start from the WF definitions and overlay the app-provided ones; on a key collision the app entry wins.
all_api2info = all_api2info_WF.copy()
all_api2info.update(all_api2info_from_app)
# print(len(all_api2info_WF), len(all_api2info_from_app), len(all_api2info))
"""Mapping of all APIs to parameter names to their types (including default values)"""
# Same merge order as above: app-provided entries override WF entries with the same key.
all_api2paraname2paratype = all_api2paraname2paratype_WF.copy()
all_api2paraname2paratype.update(all_api2paraname2paratype_from_app)
""""Mapping of shortcuts to action positions and parameter names"""
# Computed by the project helper from the detailed records and the merged parameter-type mapping.
# It is used below as: URL -> {action position -> {parameter name -> value}}.
all_shortcuts_paras_that_is_necessary_in_query = get_all_shortcuts_paras_that_is_necessary_in_query(
list(final_detailed_records.values()), all_api2paraname2paratype)
# Queries generated earlier in the pipeline. The main loop iterates this object with .items(), using the
# key as the shortcut URL and reading value["GeneratedQuery"]["query"].
generated_success_queries_path = os.path.join(SHORTCUT_DATA, "generated_success_queries.json")
with open(generated_success_queries_path, "r") as f:
generated_success_queries = json.load(f)
print(f"Number of queries generated:{len(generated_success_queries)}")
"""Identify all meaningful parameters in `generated_success_query` based on `all_api2info`,
`all_shortcuts_paras_that_is_necessary_in_query`, and `generated_success_queries`.
"""
# Resume support: results of earlier runs are stored one JSON object per line in the file below
# (despite its .json extension). Collect the URLs already present so those shortcuts are skipped.
already_significant_shortcuts = []
already_significant_shortcuts_path = os.path.join(SHORTCUT_DATA, f"{model_name}_check_intersection_of_query_and_para_necessary.json")
if os.path.exists(already_significant_shortcuts_path):
with open(already_significant_shortcuts_path, "r") as f:
# NOTE(review): a blank line in the file would make json.loads raise here.
already_significant_shortcuts = [json.loads(line) for line in f.readlines()]
already_significant_url_names = set([cur_significant_para["URL"] for cur_significant_para in already_significant_shortcuts])
# Only the URL set is needed from here on; release the parsed records.
del already_significant_shortcuts
# NOTE(review): this is an estimate — it assumes every recorded URL is a key of generated_success_queries and
# does not account for URLs the main loop skips because they have no pre-selected parameters.
to_be_processed_num = len(generated_success_queries) - len(already_significant_url_names)
print(f"Number of processed shortcuts: {len(already_significant_url_names)},Number of remaining shortcuts: {to_be_processed_num}")
# Buffer of result records produced in this run that have not been written to disk yet.
new_significant_shortcuts = []
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` occupies under the tiktoken encoding `encoding_name`."""
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))
# (Disabled) Take a small sample of not-yet-processed entries from `generated_success_queries` into
# `random_200_success_queries`. Despite the name, the disabled code stops after 10 entries and takes them
# in iteration order rather than at random.
# random_200_success_queries = {}
# cnt = 0
# for URL, cur_generated_query in generated_success_queries.items():
# if URL in already_significant_url_names:
# continue
# random_200_success_queries[URL] = cur_generated_query
# cnt += 1
# if cnt == 10:
# break
print("Begin processing new shortcuts...")
# Progress counter incremented by the processing loop; used for progress messages and the periodic save.
cnt = 0
# Cumulative costs derived from the running token totals and the per-million prices.
input_cost, output_cost, total_cost = 0, 0, 0
# Main loop: for every generated query not handled in an earlier run, ask the model to classify the
# pre-selected parameters of each action of the corresponding shortcut.
# for URL, cur_generated_query in random_200_success_queries.items():
for URL, cur_generated_query in generated_success_queries.items():
# Skip shortcuts whose results are already in the results file.
if URL in already_significant_url_names:
continue
GeneratedQuery = cur_generated_query["GeneratedQuery"]
query = GeneratedQuery["query"]
# Skip shortcuts for which no pre-selected parameters were computed.
if URL not in all_shortcuts_paras_that_is_necessary_in_query:
continue
cur_necessary_paras = all_shortcuts_paras_that_is_necessary_in_query[URL]
cur_detailed_record = final_detailed_records[URL]
shortcut = cur_detailed_record["shortcut"]
WFWorkflowActions = shortcut["WFWorkflowActions"]
significant_paras = {} # Store the meaningful parameters of the current shortcut.
if pre_compute_cost:
# Dry-run bookkeeping: becomes True once at least one action of this shortcut has been visited.
if_have_necessary_para = False
for cur_pos, cur_necessary_para_dict in cur_necessary_paras.items(): # Traversal of action levels, excluding the actions `is.workflow.actions.comment` and `alert`.
# Positions are stored as strings; convert to index into the shortcut's action list.
cur_int_pos = int(cur_pos)
API_call = WFWorkflowActions[cur_int_pos]
WFWorkflowActionIdentifier = API_call["WFWorkflowActionIdentifier"]
# WFWorkflowActionParameters = API_call["WFWorkflowActionParameters"]
cur_api2info = all_api2info[WFWorkflowActionIdentifier]
cur_necessary_names = list(cur_necessary_para_dict.keys())
# NOTE(review): cur_necessary_values is not read anywhere in the visible code.
cur_necessary_values = list(cur_necessary_para_dict.values())
system_prompt = SYSTEM_PROMPT_TEMPLATE
# The prompt's "API call" section receives only the pre-selected parameters, not the full action.
user_prompt = USER_PROMPT_TEMPLATE.format(
query=query,
# API_call=API_call,
API_call=cur_necessary_para_dict,
api_desc=cur_api2info,
possible_paras=cur_necessary_names
)
# Local token count of the concatenated prompts using the cl100k_base encoding; only consumed in dry-run mode.
pre_token_count = num_tokens_from_string(system_prompt + user_prompt, "cl100k_base")
if pre_compute_cost:
# Dry-run: accumulate statistics and move on without calling the API.
input_pre_cal_count += pre_token_count
cnt += 1
if cnt % 20 == 0:
print(f"Processed {cnt} results.")
input_para_sum += len(cur_necessary_names)
input_action_num += 1
if_have_necessary_para = True
continue
# Retry the request up to try_times times; any exception counts as a failed attempt.
try_times = 5
cur_try_time = 0
cur_input_token_count, cur_output_token_count = 0, 0
cur_input_cost, cur_output_cost, cur_total_cost = 0, 0, 0
while cur_try_time < try_times:
try:
completion = client.chat.completions.create(
model=model_name,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
]
)
# Per-request token counts and costs (not read again in the visible code).
cur_input_token_count = completion.usage.prompt_tokens
cur_output_token_count = completion.usage.completion_tokens
cur_input_cost = cur_input_token_count / 1000000 * input_price_every_million
cur_output_cost = cur_output_token_count / 1000000 * output_price_every_million
cur_total_cost = cur_input_cost + cur_output_cost
# Running totals across the whole run, and the cumulative cost derived from them.
input_token_count += completion.usage.prompt_tokens
output_token_count += completion.usage.completion_tokens
input_cost = input_token_count / 1000000 * input_price_every_million
output_cost = output_token_count / 1000000 * output_price_every_million
total_cost = input_cost + output_cost
break
except Exception as e:
print(f"Fail! Generation Error! generating shortcut {URL} on pos {cur_int_pos}, {WFWorkflowActionIdentifier}")
print(e)
cur_try_time += 1
# After the last failed attempt: flush the buffered results to disk, then re-raise to stop the run.
# NOTE(review): `raise e` below only works if this check sits inside the `except` clause, because
# Python unbinds `e` when the clause ends — confirm the nesting.
if cur_try_time == try_times:
print(f"Processed {cnt} results.") # Save the current results.
# cost_path is only used by the disabled cost dump below.
cost_path = os.path.join(
SHORTCUT_DATA,
f"cost_{model_name}_check_intersection_of_query_and_para_necessary.json")
# with open(cost_path, "w") as f:
# json.dump({
# "input_token_count": input_token_count,
# "output_token_count": output_token_count,
# "input_cost": input_cost,
# "output_cost": output_cost,
# "total_cost": total_cost
# }, f, indent=4, ensure_ascii=False)
res_path = os.path.join(
SHORTCUT_DATA,
f"{model_name}_check_intersection_of_query_and_para_necessary.json")
# Append mode: earlier results in the file are kept; one JSON object is written per line.
with open(res_path, "a") as f:
write_str = ""
for res in new_significant_shortcuts:
write_str += json.dumps(res) + "\n"
f.write(write_str)
raise e
# Parse the model's reply as JSON; a reply that cannot be parsed is logged and this action is skipped.
generated_content = None
try:
generated_content = completion.choices[0].message.content
generated_content = json.loads(generated_content)
except Exception as e:
print(f"Fail! Generation Error! generating shortcut {URL} on pos {cur_int_pos}, {WFWorkflowActionIdentifier}")
print(e)
continue
# print("query: ", query)
# print("api_desc: ", cur_api2info)
# print("API_call: ", json.dumps(API_call, indent=4, ensure_ascii=False))
# print("necessary_paras: ", cur_necessary_names)
# print(f"Generated content: {json.dumps(generated_content, indent=4, ensure_ascii=False)}")
# input()
# Record the classification for this action, keyed by its position as a string.
significant_paras[str(cur_int_pos)] = generated_content
print(f"Token count: {input_token_count}, {output_token_count}, Input Cost {input_cost}, Output Cost {output_cost}, Total Cost {total_cost}")
# Short pause between requests. NOTE(review): presumably to stay under API rate limits — confirm.
time.sleep(0.5)
if pre_compute_cost and if_have_necessary_para:
input_shortcut_num += 1
if not pre_compute_cost:
# Buffer one result record for this shortcut.
# NOTE(review): "api_desc" holds cur_api2info as left by the most recently visited action (and would be
# stale or undefined if this shortcut had no actions to visit) — confirm this is intended.
new_significant_shortcuts.append(
{
"URL": URL,
"query": query,
"api_desc": cur_api2info,
"necessary_paras": cur_necessary_paras,
"significant_paras": significant_paras
})
cnt += 1
print(f"Processed {cnt} results.")
# Periodic save every 20 results. The progress message is printed a second time on these iterations.
if cnt % 20 == 0:
print(f"Processed {cnt} results.")
# cost_path is only used by the disabled cost dump below.
cost_path = os.path.join(
SHORTCUT_DATA,
f"cost_{model_name}_check_intersection_of_query_and_para_necessary.json")
# with open(cost_path, "w") as f:
# json.dump({
# "input_token_count": input_token_count,
# "output_token_count": output_token_count,
# "input_cost": input_cost,
# "output_cost": output_cost,
# "total_cost": total_cost
# }, f, indent=4, ensure_ascii=False)
res_path = os.path.join(
SHORTCUT_DATA,
f"{model_name}_check_intersection_of_query_and_para_necessary.json")
# Append the buffered records (one JSON object per line), then clear the buffer so they are not written twice.
with open(res_path, "a") as f:
write_str = ""
for res in new_significant_shortcuts:
write_str += json.dumps(res) + "\n"
f.write(write_str)
new_significant_shortcuts = []
print(f"Saved {cnt} results.")
# End of run: in normal mode append whatever is still buffered to the results file; in dry-run mode print
# the accumulated statistics instead.
if not pre_compute_cost:
# cost_path is only used by the disabled cost dump below.
cost_path = os.path.join(
SHORTCUT_DATA,
f"cost_{model_name}_check_intersection_of_query_and_para_necessary.json")
# with open(cost_path, "w") as f:
# json.dump({
# "input_token_count": input_token_count,
# "output_token_count": output_token_count,
# "input_cost": input_cost,
# "output_cost": output_cost,
# "total_cost": total_cost
# }, f, indent=4, ensure_ascii=False)
res_path = os.path.join(
SHORTCUT_DATA,
f"{model_name}_check_intersection_of_query_and_para_necessary.json")
# Append mode, one JSON object per line; an empty buffer results in an empty write.
with open(res_path, "a") as f:
write_str = ""
for res in new_significant_shortcuts:
write_str += json.dumps(res) + "\n"
f.write(write_str)
else:
# NOTE(review): the next line looks like output recorded from an earlier dry run — confirm.
# Pre-computed token cnt: 22095963, cost 11.0479815
# The cost estimate covers input (prompt) tokens only.
print(f"Pre-computed token cnt: {input_pre_cal_count}, cost {input_pre_cal_count / 1000000 * input_price_every_million}, number of paras {input_para_sum}, input_action_num {input_action_num}, input_shortcut_num {input_shortcut_num}")
def jsonl_to_json(jsonl_file_path, json_file_path):
    """Convert a JSON Lines file into one JSON object keyed by URL.

    Every non-blank line of the input must be a JSON object containing a
    ``'URL'`` key. That key is removed from the record and used as the key of
    the record in the output object. When several records share a URL, the
    last one in the file wins.

    Args:
        jsonl_file_path: Path of the JSON Lines file to read (UTF-8).
        json_file_path: Path of the JSON file to write (UTF-8, indent=4,
            non-ASCII characters kept as-is).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'URL'`` key.
    """
    result_dict = {}
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # The results file is built by repeated appends, so tolerate
            # blank or whitespace-only lines instead of failing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            url = record.pop('URL')
            result_dict[url] = record
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(result_dict, json_file, ensure_ascii=False, indent=4)
# Convert the line-per-record results file (JSON Lines content despite its .json extension) into a single
# JSON object keyed by URL, written next to it with a "json-" prefix.
src_path = os.path.join(SHORTCUT_DATA, f"{model_name}_check_intersection_of_query_and_para_necessary.json")
dst_path = os.path.join(SHORTCUT_DATA, f"json-{model_name}_check_intersection_of_query_and_para_necessary.json")
jsonl_to_json(src_path, dst_path)