import json


def combine_jsonl(file1, file2, output_file):
    """Concatenate two JSON Lines files into a single JSON Lines file.

    Every non-blank line of ``file1`` and then of ``file2`` is parsed as JSON
    and re-serialized, one value per line, into ``output_file``. Non-ASCII
    characters are written as-is (not ``\\uXXXX``-escaped).

    Args:
        file1: Path of the first JSONL file to read (UTF-8).
        file2: Path of the second JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write; overwritten if present.

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    combined_data = []

    # Both inputs are fully parsed before the output is opened, so
    # ``output_file`` may be the same path as one of the inputs.
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            # Blank lines (e.g. a trailing empty line) are skipped because
            # json.loads would reject them.
            combined_data.extend(
                json.loads(line) for line in src if line.strip()
            )

    # Write the merged records, one JSON value per line.
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n'
            for entry in combined_data
        )


# Example usage: merge 4.jsonl and the downloaded MBPP test file into 5.jsonl.
combine_jsonl(
    '4.jsonl',
    'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl',
    '5.jsonl',
)


def combine_jsonl(file1, file2, output_file):
    """Concatenate two JSON Lines files into a single JSON Lines file.

    Every non-blank line of ``file1`` and then of ``file2`` is parsed as JSON
    and re-serialized, one value per line, into ``output_file``. Non-ASCII
    characters are written as-is (not ``\\uXXXX``-escaped).

    Args:
        file1: Path of the first JSONL file to read (UTF-8).
        file2: Path of the second JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write; overwritten if present.

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    combined_data = []

    # Both inputs are fully parsed before the output is opened, so
    # ``output_file`` may be the same path as one of the inputs.
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            # Blank lines (e.g. a trailing empty line) are skipped because
            # json.loads would reject them.
            combined_data.extend(
                json.loads(line) for line in src if line.strip()
            )

    # Write the merged records, one JSON value per line.
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n'
            for entry in combined_data
        )


# Example usage: merge 4.jsonl and the downloaded MBPP test file into 5.jsonl.
combine_jsonl(
    '4.jsonl',
    'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl',
    '5.jsonl',
)


def combine_jsonl(file1, file2, output_file):
    """Concatenate two JSON Lines files into a single JSON Lines file.

    Every non-blank line of ``file1`` and then of ``file2`` is parsed as JSON
    and re-serialized, one value per line, into ``output_file``. Non-ASCII
    characters are written as-is (not ``\\uXXXX``-escaped).

    Args:
        file1: Path of the first JSONL file to read (UTF-8).
        file2: Path of the second JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write; overwritten if present.

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    combined_data = []

    # Both inputs are fully parsed before the output is opened, so
    # ``output_file`` may be the same path as one of the inputs.
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            # Blank lines (e.g. a trailing empty line) are skipped because
            # json.loads would reject them.
            combined_data.extend(
                json.loads(line) for line in src if line.strip()
            )

    # Write the merged records, one JSON value per line.
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n'
            for entry in combined_data
        )


# Example usage: merge 4.jsonl and the downloaded MBPP test file into 5.jsonl.
combine_jsonl(
    '4.jsonl',
    'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl',
    '5.jsonl',
)


import pandas as pd


def parquet_to_jsonl(parquet_file, jsonl_file):
    """Convert a Parquet file into a JSON Lines file.

    The Parquet file is loaded into a DataFrame with the ``pyarrow`` engine
    and each row is serialized with ``Series.to_json()`` onto its own line.

    Args:
        parquet_file: Path of the Parquet file to read.
        jsonl_file: Path of the JSONL file to write; overwritten if present.
    """
    # Load the whole Parquet file into memory.
    df = pd.read_parquet(parquet_file, engine='pyarrow')

    # The encoding is stated explicitly so the result does not depend on the
    # platform default, matching the other JSONL helpers in this file.
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        # The row label returned by iterrows() is not needed.
        for _, row in df.iterrows():
            f.write(row.to_json() + '\n')


# Example usage: convert the downloaded Parquet split into e1.jsonl.
parquet_file = 'E:\\train-00000-of-00001.parquet'
jsonl_file = 'e1.jsonl'
parquet_to_jsonl(parquet_file, jsonl_file)
# print(1)


# NOTE(review): orphaned, truncated snippet. Its beginning is missing from this
# file: the first surviving token is "(line)" (presumably the tail of
# `data = json.loads(line)` -- unconfirmed), and the names `data` and `f_out`
# are never defined for it. It cannot run as written, so it is preserved here
# as a comment until the original snippet is recovered. The nesting of the
# statements is unknown because the indentation was lost.
#
#   (line)
#   if "output" in data:
#   value = data["output"]
#   # Find the position of the marker string
#   index = value.find("\n```python")
#   if index!= -1:
#   # If the marker is found, keep only the text before it
#   new_value = value[:index]
#   data["input"] = new_value
#   f_out.write(json.dumps(data) + '\n')


# Names of the two files: every line of the source is appended to the target.
source_filename = '3.jsonl'
target_filename = '1.jsonl'

# Open the source for reading and the target for appending ('a+' also allows
# reading the existing content and creates the file if it does not exist).
with open(source_filename, 'r', encoding='utf-8') as source_file, \
        open(target_filename, 'a+', encoding='utf-8') as target_file:
    # Inspect the existing target content.
    target_file.seek(0)  # move to the start of the file
    existing_content = target_file.read()
    is_target_empty = existing_content == ''

    # A separating line break is only needed when the target has content that
    # does not already end with one; writing it unconditionally would leave a
    # blank line in the JSONL file.
    if not is_target_empty and not existing_content.endswith('\n'):
        target_file.seek(0, 2)  # move to the end of the file
        target_file.write('\n')

    # Copy the source content to the end of the target, line by line.
    for line in source_file:
        target_file.write(line)


# RYin: 01-20 15:08:16  (chat-log header left over from pasting the snippet that follows)

import json

# Input and output file names
input_filename = 'code1.jsonl'
output_filename = 'code2.jsonl'

# Key to remove from every record
key_to_delete = "system"

# Read the input file, process each JSON object and write it to the output file.
with open(input_filename, 'r', encoding='utf-8') as infile, \
        open(output_filename, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Blank lines carry no record and would make json.loads raise.
        if not line.strip():
            continue
        data = json.loads(line)

        # Remove the key when it is present; other records pass through as-is.
        if key_to_delete in data:
            del data[key_to_delete]

        # Write the (possibly modified) JSON object back, one per line.
        json.dump(data, outfile)
        outfile.write('\n')


# RYin: 01-20 15:08:25  (chat-log header left over from pasting the snippet that follows)

import json

# File paths
input_file_path = 'code2.jsonl'
output_file_path = 'code1.jsonl'

# Everything up to and including the first occurrence of this marker is
# removed from each record's "output" value.
target_string = '```python\n'

# Process the JSONL file
with open(input_file_path, 'r', encoding='utf-8') as infile, \
        open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Blank lines carry no record and would make json.loads raise.
        if not line.strip():
            continue

        # Parse one JSON record per line.
        data = json.loads(line)

        # Only records that have an "output" key are modified.
        if 'output' in data:
            content = data['output']
            if target_string in content:
                # Keep only the text after the first occurrence of the marker.
                content = content.split(target_string, 1)[-1]
            data['output'] = content

        # Every record is written to the new JSONL file, modified or not.
        outfile.write(json.dumps(data) + '\n')

print("处理完成!")


import json

# File paths
input_file_path = 'code2.jsonl'
output_file_path = 'code1.jsonl'

# Everything up to and including the first occurrence of this marker is
# removed from each record's "output" value.
target_string = '```python\n'

# Process the JSONL file
with open(input_file_path, 'r', encoding='utf-8') as infile, \
        open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Blank lines carry no record and would make json.loads raise.
        if not line.strip():
            continue

        # Parse one JSON record per line.
        data = json.loads(line)

        # Only records that have an "output" key are modified.
        if 'output' in data:
            content = data['output']
            if target_string in content:
                # Keep only the text after the first occurrence of the marker.
                content = content.split(target_string, 1)[-1]
            data['output'] = content

        # Every record is written to the new JSONL file, modified or not.
        outfile.write(json.dumps(data) + '\n')

print("处理完成!")


import json

# Read the original JSONL file and write a modified copy to y1.jsonl.
with open('C:\\Users\\LENOVO\\Downloads\\output17.jsonl', 'r', encoding='utf-8') as infile, \
        open('y1.jsonl', 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Blank lines carry no record and would make json.loads raise.
        if not line.strip():
            continue
        data = json.loads(line)

        # Replace only the first occurrence of "question" in the "input" value.
        if 'input' in data:
            data['input'] = data['input'].replace("question", "Instruction", 1)

        # Write the record to the new JSONL file, keeping non-ASCII text as-is.
        outfile.write(json.dumps(data, ensure_ascii=False) + '\n')


import json

# Key whose value is overwritten with a running counter (stored as a string).
old_key = "task_id"
# Counter; it is incremented for every record, so the first record gets "1".
new_value = 0

# Read the JSON Lines file.
with open('5.jsonl', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Serialized, updated records in their original order.
updated_lines = []

for line in lines:
    # Blank lines carry no record and would make json.loads raise.
    if not line.strip():
        continue

    # Parse the JSON object.
    data = json.loads(line)
    # The counter advances for every record, including those without the key.
    new_value = new_value + 1

    # Overwrite the key's value only if the key exists. (Assign `new_value`
    # directly to store an int, or use `del data[old_key]` to drop the key.)
    if old_key in data:
        data[old_key] = str(new_value)

    updated_lines.append(json.dumps(data))

# Write the updated records to a new JSON Lines file. Every record, including
# the last one, is newline-terminated so the file can be appended to safely.
with open('6.jsonl', 'w', encoding='utf-8') as file:
    file.writelines(entry + '\n' for entry in updated_lines)

print("文件已更新。")


import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.5.8121 SLBChan/111 SLBVPV/64-bit",
}

# Fetch the dataset viewer page. Without a timeout requests can wait forever,
# and an HTTP error page would otherwise be parsed as if it were the data.
response = requests.get(
    "https://hf-mirror.com/datasets/openai/openai_humaneval/viewer/openai_humaneval/test?row=0",
    headers=headers,
    timeout=30,
)
response.raise_for_status()

html = response.text
soup = BeautifulSoup(html, "html.parser")

# Earlier selector experiments, kept for reference:
# all_content = soup.select('[class="hljs-ln-code"]')
# all_content = soup.find_all("div",{"class":"htmledit_views"})
all_content = soup.find_all("div", {"class": "line-clamp-2"})

# Print the text of every matching <div>.
for content in all_content:
    # content_string = content.string
    # if "//" not in content.text:
    print(content.text)
    # with open("./data.txt","w",encoding="utf-8") as f:
    #     f.write(content.text)