You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

262 lines
8.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
def combine_jsonl(file1, file2, output_file):
    """Concatenate two JSON Lines files into a single JSON Lines file.

    Every record of ``file1`` is written first, followed by every record of
    ``file2``. Both inputs are read completely before the output is opened,
    so ``output_file`` may be the same path as one of the inputs.

    Args:
        file1: Path of the first UTF-8 encoded JSONL file.
        file2: Path of the second UTF-8 encoded JSONL file.
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        OSError: If an input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    combined_data = []
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            # Blank lines (typically left by a trailing newline) are not
            # records; json.loads would raise on them, so skip them.
            combined_data.extend(
                json.loads(line) for line in src if line.strip()
            )

    with open(output_file, 'w', encoding='utf-8') as out:
        # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX.
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n'
            for entry in combined_data
        )
# Example usage: write the records of the two input files to '5.jsonl'.
combine_jsonl('4.jsonl', 'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl', '5.jsonl')
def combine_jsonl(file1, file2, output_file):
    """Concatenate two JSON Lines files into a single JSON Lines file.

    Every record of ``file1`` is written first, followed by every record of
    ``file2``. Both inputs are read completely before the output is opened,
    so ``output_file`` may be the same path as one of the inputs.

    Args:
        file1: Path of the first UTF-8 encoded JSONL file.
        file2: Path of the second UTF-8 encoded JSONL file.
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        OSError: If an input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    combined_data = []
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            # Blank lines (typically left by a trailing newline) are not
            # records; json.loads would raise on them, so skip them.
            combined_data.extend(
                json.loads(line) for line in src if line.strip()
            )

    with open(output_file, 'w', encoding='utf-8') as out:
        # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX.
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n'
            for entry in combined_data
        )
# Example usage: write the records of the two input files to '5.jsonl'.
combine_jsonl('4.jsonl', 'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl', '5.jsonl')
def combine_jsonl(file1, file2, output_file):
    """Concatenate two JSON Lines files into a single JSON Lines file.

    Every record of ``file1`` is written first, followed by every record of
    ``file2``. Both inputs are read completely before the output is opened,
    so ``output_file`` may be the same path as one of the inputs.

    Args:
        file1: Path of the first UTF-8 encoded JSONL file.
        file2: Path of the second UTF-8 encoded JSONL file.
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        OSError: If an input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    combined_data = []
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            # Blank lines (typically left by a trailing newline) are not
            # records; json.loads would raise on them, so skip them.
            combined_data.extend(
                json.loads(line) for line in src if line.strip()
            )

    with open(output_file, 'w', encoding='utf-8') as out:
        # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX.
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n'
            for entry in combined_data
        )
# Example usage: write the records of the two input files to '5.jsonl'.
combine_jsonl('4.jsonl', 'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl', '5.jsonl')
import pandas as pd
def parquet_to_jsonl(parquet_file, jsonl_file):
    """Convert a Parquet file to a JSON Lines file, one row per line.

    Args:
        parquet_file: Path of the Parquet file to read (pyarrow engine).
        jsonl_file: Path of the JSONL file to create or overwrite.
    """
    df = pd.read_parquet(parquet_file, engine='pyarrow')

    # Serialise the whole frame in one call. DataFrame.iterrows() converts
    # every row to a Series with a single dtype, so integers in a frame that
    # also has float columns would be written as floats (1 -> 1.0); it is
    # also much slower than the frame-level writer.
    payload = df.to_json(orient='records', lines=True) if len(df) else ''
    # Always finish the last record with a newline, whatever the pandas
    # version emits, so the file can be appended to safely.
    if payload and not payload.endswith('\n'):
        payload += '\n'

    # to_json escapes non-ASCII characters by default; the explicit encoding
    # just makes the written bytes independent of the platform default.
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        f.write(payload)
# Example usage: convert the Parquet file below into 'e1.jsonl'.
parquet_file = 'E:\\train-00000-of-00001.parquet'
jsonl_file = 'e1.jsonl'
parquet_to_jsonl(parquet_file, jsonl_file)
#print(1)
(line)
if "output" in data:
value = data["output"]
# 查找特定字符串的位置
index = value.find("\n```python")
if index!= -1:
# 如果找到特定字符串,保留其前面的内容
new_value = value[:index]
data["input"] = new_value
f_out.write(json.dumps(data) + '\n')
# Append the contents of the source file to the end of the target file.
source_filename = '3.jsonl'
target_filename = '1.jsonl'

# The source is opened first so that a missing source fails before the
# target is created or touched.
with open(source_filename, 'r', encoding='utf-8') as source_file:
    # Inspect the target's last byte. Binary mode is used because text files
    # cannot seek relative to the end; 'ab+' creates the file if it is missing.
    with open(target_filename, 'ab+') as probe:
        probe.seek(0, 2)  # move to the end of the file
        if probe.tell() == 0:
            needs_separator = False  # empty target: nothing to separate
        else:
            probe.seek(-1, 2)
            # Only a target whose last line is unterminated needs a newline;
            # adding one unconditionally would leave a blank line that breaks
            # line-by-line json.loads readers.
            needs_separator = probe.read(1) != b'\n'

    with open(target_filename, 'a', encoding='utf-8') as target_file:
        if needs_separator:
            target_file.write('\n')
        # Copy the source line by line onto the end of the target.
        target_file.writelines(source_file)
RYin: 01-20 15:08:16
import json
# Input and output file names
input_filename = 'code1.jsonl'
output_filename = 'code2.jsonl'
# Key to remove from every record
key_to_delete = "system"

# Copy the JSONL file record by record, dropping the key wherever it occurs.
with open(input_filename, 'r', encoding='utf-8') as infile, \
        open(output_filename, 'w', encoding='utf-8') as outfile:
    for raw in infile:
        record = json.loads(raw)
        if key_to_delete in record:
            del record[key_to_delete]
        # One serialised record per output line.
        outfile.write(f"{json.dumps(record)}\n")
RYin: 01-20 15:08:25
import json
# File paths and the marker to cut at
input_file_path = 'code2.jsonl'
output_file_path = 'code1.jsonl'
target_string = '```python\n'

# Rewrite the JSONL file, trimming each record's 'output' value.
with open(input_file_path, 'r', encoding='utf-8') as infile, \
        open(output_file_path, 'w', encoding='utf-8') as outfile:
    for raw in infile:
        record = json.loads(raw)
        if 'output' in record:
            text = record['output']
            if target_string in text:
                # Drop the first occurrence of the marker and everything
                # before it; keep only what follows.
                record['output'] = text.partition(target_string)[2]
        # Records are written whether or not they were modified.
        outfile.write(f"{json.dumps(record)}\n")
print("处理完成!")
import json
# File paths and the marker to cut at
input_file_path = 'code2.jsonl'
output_file_path = 'code1.jsonl'
target_string = '```python\n'

# Rewrite the JSONL file, trimming each record's 'output' value.
with open(input_file_path, 'r', encoding='utf-8') as infile, \
        open(output_file_path, 'w', encoding='utf-8') as outfile:
    for raw in infile:
        record = json.loads(raw)
        if 'output' in record:
            text = record['output']
            if target_string in text:
                # Drop the first occurrence of the marker and everything
                # before it; keep only what follows.
                record['output'] = text.partition(target_string)[2]
        # Records are written whether or not they were modified.
        outfile.write(f"{json.dumps(record)}\n")
print("处理完成!")
import json
# Copy the source JSONL file to 'y1.jsonl', editing each record's 'input'.
with open('C:\\Users\\LENOVO\\Downloads\\output17.jsonl', 'r', encoding='utf-8') as infile, \
        open('y1.jsonl', 'w', encoding='utf-8') as outfile:
    for raw in infile:
        record = json.loads(raw)
        if 'input' in record:
            original_input = record['input']
            # The count of 1 limits the replacement to the first occurrence.
            record['input'] = original_input.replace("question", "Instruction", 1)
        # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX.
        outfile.write(f"{json.dumps(record, ensure_ascii=False)}\n")
import json
# Key whose value is overwritten with a running record number.
old_key = "task_id"
# Running counter; it is advanced before use, so the first line gets 1.
new_value = 0

# Load every line of the input JSONL file.
with open('5.jsonl', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Serialised records, in input order.
updated_lines = []
# enumerate advances the counter once per line, whether or not the record
# contains the key, and leaves new_value at the number of lines processed.
for new_value, raw in enumerate(lines, start=new_value + 1):
    record = json.loads(raw)
    if old_key in record:
        # The number is stored as a string.
        record[old_key] = str(new_value)
    updated_lines.append(json.dumps(record))

# Write the records to a new file, newline-separated (no trailing newline).
with open('6.jsonl', 'w', encoding='utf-8') as file:
    file.write('\n'.join(updated_lines))
print("文件已更新。")
import requests
from bs4 import BeautifulSoup
# Fetch the dataset viewer page and print the text of every matching <div>.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.5.8121 SLBChan/111 SLBVPV/64-bit"
}
response = requests.get(
    "https://hf-mirror.com/datasets/openai/openai_humaneval/viewer/openai_humaneval/test?row=0",
    headers=headers,
    # requests has no default timeout; without one a stalled server would
    # block the script forever.
    timeout=30,
)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
# Selectors tried previously:
#   soup.select('[class="hljs-ln-code"]')
#   soup.find_all("div", {"class": "htmledit_views"})
all_content = soup.find_all("div", {"class": "line-clamp-2"})
for content in all_content:
    print(content.text)
    # To save instead of printing, write content.text to "./data.txt"
    # (opened with encoding="utf-8").