You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
readingcode/src/prepare_sft_data.py

103 lines
4.7 KiB

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
# Per-tool lookup tables: model identifier -> short alias.
# Each table is spelled out on its own so it can be read (and edited) in
# isolation; insertion order is preserved when they are assembled below.
_SEARCH_ALIASES = {
    "gpt-5": "search-1",
    "gpt-5-mini": "search-2",
    "Qwen/Qwen3-32B": "search-3",
}

_ENHANCE_REASONING_ALIASES = {
    "gpt-5": "reasoner-1",
    "gpt-5-mini": "reasoner-2",
    "Qwen/Qwen2.5-Coder-32B-Instruct": "reasoner-3",
}

_ANSWER_ALIASES = {
    "Qwen/Qwen2.5-Math-72B-Instruct": "answer-math-1",
    "Qwen/Qwen2.5-Math-7B-Instruct": "answer-math-2",
    "gpt-5": "answer-1",
    "gpt-5-mini": "answer-2",
    "meta-llama/Llama-3.3-70B-Instruct": "answer-3",
    "Qwen/Qwen3-32B": "answer-4",
}

# Two-level mapping: tool name -> model identifier -> alias.
# NOTE(review): nothing in the code visible here reads this table; confirm
# whether another module imports it before changing any alias.
MODEL_MAPPING = {
    "search": _SEARCH_ALIASES,
    "enhance_reasoning": _ENHANCE_REASONING_ALIASES,
    "answer": _ANSWER_ALIASES,
}
# ---------------------------------------------------------------------------
# Script configuration (module level so it stays easy to edit).
# ---------------------------------------------------------------------------
# Id of the example in evaluation/hle.jsonl whose question seeds the dialogue.
task_id = '66f5e796acadd55c11fb11f5'
# JSON file holding the recorded trajectory; the script reads its
# "all_tool_calls" and "tool_responses" entries.
output_path = 'example.json'
# Directory that receives the numbered conversation-prefix files.
output_dir = 'sft_data'

# System message placed at the start of every conversation. It declares the
# two tools (code_interpreter taking "code", search taking "query") and the
# <tool_call> wire format the assistant turns below follow.
SYSTEM_PROMPT = "You are good at using tools.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"code_interpreter\", \"description\": \"python executor to execute code and return outputs\", \"parameters\": {\"properties\": {\"code\": {\"description\": \"The code to execute\", \"type\": \"string\"}}, \"required\": [\"code\"], \"title\": \"parameters\", \"type\": \"object\"}}}\n{\"type\": \"function\", \"function\": {\"name\": \"search\", \"description\": \"Search for missing information\", \"parameters\": {\"properties\": {\"query\": {\"description\": \"The query used to search missing information\", \"type\": \"string\"}}, \"required\": [\"query\"], \"title\": \"parameters\", \"type\": \"object\"}}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"


def load_examples(path):
    """Read a JSONL file and return a dict mapping each record's 'id' to the record.

    Blank lines are skipped so a trailing newline (or an empty separator
    line) does not abort the load with a JSON decode error.
    """
    id2example = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            example = json.loads(line)
            id2example[example['id']] = example
    return id2example


def format_tool_call(name, arguments):
    """Return '<tool_call>{...}</tool_call>' with a properly JSON-encoded payload.

    The payload is produced by json.dumps rather than by pasting text into
    a template, so quotes, backslashes and newlines inside an argument are
    escaped and the payload always parses. For arguments without such
    characters the output is byte-identical to the former template;
    ensure_ascii=False keeps non-ASCII text unescaped, as before.
    """
    payload = json.dumps({"name": name, "arguments": arguments}, ensure_ascii=False)
    return f'<tool_call>{payload}</tool_call>'


def build_messages(problem, results_data, max_turns=100):
    """Turn a recorded trajectory into a chat-style message list.

    Args:
        problem: Question text; becomes the first user message.
        results_data: Dict with two entries read here:
            "tool_responses": dict keyed by "turn_<i>_response"; element [0]
                of each value is the response dict for that turn.
            "all_tool_calls": sequence indexed by turn; element [0][1] is the
                text emitted before the tool call / answer.
        max_turns: Turn indices 0..max_turns-1 are probed; indices without a
            recorded response are skipped.

    Returns:
        A list of {"role", "content"} dicts: system, user, then for each tool
        turn an assistant message followed by a user message carrying the
        tool output, and a lone assistant message for an answer turn.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Problem: {problem}"},
    ]
    # Documents already shown in an earlier turn; a set gives O(1) membership.
    seen_documents = set()
    tool_responses = results_data["tool_responses"]

    for turn in range(max_turns):
        response_key = f"turn_{turn}_response"
        if response_key not in tool_responses:
            continue
        model_response = results_data["all_tool_calls"][turn][0][1]
        cur_dict = tool_responses[response_key][0]

        if "search_results_data" in cur_dict:
            # Only documents not seen in any previous turn are put into the
            # context.
            # NOTE(review): indentation was lost in the copy this was restored
            # from; the append to the context is assumed to sit inside the
            # "not seen yet" check -- confirm against the original file.
            fresh_documents = []
            for document in cur_dict["search_results_data"]:
                if document not in seen_documents:
                    seen_documents.add(document)
                    fresh_documents.append(document)
            context = ''.join(document + '\n\n' for document in fresh_documents)
            tool_call = format_tool_call("search", {"query": cur_dict['query']})
            messages.append({
                'role': 'assistant',
                'content': model_response + tool_call,
            })
            messages.append({
                'role': 'user',
                'content': "Search results:\n" + context,
            })
        elif "generated_code" in cur_dict and "exec_result" in cur_dict:
            # The argument is named "code", matching the code_interpreter
            # schema declared in SYSTEM_PROMPT (it was previously emitted
            # under "query", which that schema does not define).
            tool_call = format_tool_call(
                "code_interpreter", {"code": cur_dict["generated_code"]}
            )
            messages.append({
                'role': 'assistant',
                'content': model_response + tool_call,
            })
            messages.append({
                'role': 'user',
                'content': 'Execution results:\n' + cur_dict["exec_result"],
            })
        elif 'answer_response' in cur_dict:
            messages.append({
                'role': 'assistant',
                'content': model_response + cur_dict['answer_response'],
            })
    return messages


def write_prefixes(messages, output_dir):
    """Write every conversation prefix of length 3, 5, 7, ... as '<n>.json'.

    Files are numbered from 1 inside output_dir, which is created if it is
    missing. Returns the number of files written.
    """
    os.makedirs(output_dir, exist_ok=True)
    written = 0
    # Prefix lengths step by 2 starting at 3 (system + user + first reply).
    for written, end in enumerate(range(3, len(messages) + 1, 2), start=1):
        with open(os.path.join(output_dir, f"{written}.json"), 'w', encoding='utf-8') as f:
            json.dump(messages[:end], f, indent=2)
    return written


def main():
    """Load the configured example and trajectory, then emit the prefix files."""
    id2example = load_examples('evaluation/hle.jsonl')
    with open(output_path, encoding='utf-8') as f:
        results_data = json.load(f)
    problem = id2example[task_id]['question']
    messages = build_messages(problem, results_data)
    write_prefixes(messages, output_dir)


if __name__ == "__main__":
    main()