"""Scratch utilities for preparing JSON Lines (JSONL) dataset files.

This module was reconstructed from a paste in which several independent
one-off scripts had been collapsed onto a few physical lines.  Each script
is now a small function; the original hard-coded example invocations live
in ``main()`` and only run when the file is executed directly.
"""

import json


def _load_jsonl(path):
    """Return the parsed records of the UTF-8 JSONL file *path* as a list.

    Blank lines are skipped, because ``json.loads`` raises on them.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _transform_jsonl(input_path, output_path, transform, *, ensure_ascii=True):
    """Stream *input_path* to *output_path*, applying *transform* per record.

    *transform* receives each parsed record and returns the value to write.
    The input is opened first, so a missing input never truncates the output.
    Blank input lines are skipped.
    """
    with open(input_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                continue
            record = transform(json.loads(line))
            outfile.write(json.dumps(record, ensure_ascii=ensure_ascii) + '\n')


def _lacks_trailing_newline(path):
    """Return True if *path* exists, is non-empty and does not end in a newline."""
    try:
        with open(path, 'rb') as handle:
            handle.seek(0, 2)  # jump to the end to learn the size
            if handle.tell() == 0:
                return False
            handle.seek(-1, 2)
            return handle.read(1) != b'\n'
    except FileNotFoundError:
        return False


def combine_jsonl(file1, file2, output_file):
    """Write the records of *file1* followed by those of *file2* to *output_file*.

    Both inputs are read completely before the output is opened, so the
    output path may safely be one of the inputs.  Non-ASCII text is written
    as-is (``ensure_ascii=False``).
    """
    combined_data = _load_jsonl(file1) + _load_jsonl(file2)
    with open(output_file, 'w', encoding='utf-8') as out:
        for entry in combined_data:
            out.write(json.dumps(entry, ensure_ascii=False) + '\n')


def parquet_to_jsonl(parquet_file, jsonl_file):
    """Convert a parquet file to JSONL, one JSON object per DataFrame row.

    The file is read with pandas' ``pyarrow`` engine; each row is serialized
    with ``Series.to_json()``.
    """
    # Imported lazily so the plain-JSONL helpers work without pandas.
    import pandas as pd

    df = pd.read_parquet(parquet_file, engine='pyarrow')
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            f.write(row.to_json() + '\n')


def copy_prompt_before_code_fence(input_path, output_path):
    """Set ``input`` to the part of ``output`` that precedes the code fence.

    For every record that has an ``"output"`` value containing
    ``"\\n```python"``, the text before that marker is stored under
    ``"input"``.  Every record is written to *output_path*.

    NOTE(review): the head of this snippet (its ``open`` calls and file
    names) was lost in the paste, so the paths are parameters here and the
    write is assumed to happen for every record -- confirm against the
    original script if it is still available.
    """
    def _copy_prompt(record):
        if "output" in record:
            value = record["output"]
            fence_at = value.find("\n```python")
            if fence_at != -1:
                record["input"] = value[:fence_at]
        return record

    _transform_jsonl(input_path, output_path, _copy_prompt)


def append_jsonl(source_path, target_path):
    """Append the lines of *source_path* to the end of *target_path*.

    A newline separator is written first only when the target is non-empty
    and does not already end with one.  Writing it unconditionally would
    leave a blank line in the middle of the file, which line-by-line
    ``json.loads`` readers cannot parse.
    """
    with open(source_path, 'r', encoding='utf-8') as source:
        separator = '\n' if _lacks_trailing_newline(target_path) else ''
        with open(target_path, 'a', encoding='utf-8') as target:
            target.write(separator)
            target.writelines(source)


def delete_key(input_path, output_path, key):
    """Copy *input_path* to *output_path*, removing *key* from each record."""
    def _drop(record):
        if key in record:
            del record[key]
        return record

    _transform_jsonl(input_path, output_path, _drop)


def strip_through_marker(input_path, output_path, marker='```python\n',
                         field='output'):
    """Drop everything up to and including the first *marker* in *field*.

    Records without *field*, or whose *field* does not contain *marker*,
    are written unchanged.
    """
    def _strip(record):
        if field in record:
            # split(..., 1)[-1] is the text after the first marker, or the
            # whole string when the marker is absent.
            record[field] = record[field].split(marker, 1)[-1]
        return record

    _transform_jsonl(input_path, output_path, _strip)


def replace_first_in_field(input_path, output_path, field, old, new):
    """Replace the first occurrence of *old* with *new* in each record's *field*.

    Output keeps non-ASCII characters unescaped (``ensure_ascii=False``).
    """
    def _replace(record):
        if field in record:
            record[field] = record[field].replace(old, new, 1)
        return record

    _transform_jsonl(input_path, output_path, _replace, ensure_ascii=False)


def renumber_key(input_path, output_path, key="task_id"):
    """Overwrite *key* with the record's 1-based position, as a string.

    The counter advances for every record, including those that lack *key*.
    Lines are joined with ``'\\n'``, so the output has no trailing newline.
    """
    updated_lines = []
    for number, record in enumerate(_load_jsonl(input_path), start=1):
        if key in record:
            record[key] = str(number)
        updated_lines.append(json.dumps(record))
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(updated_lines))


def print_dataset_viewer_text(url, headers):
    """Fetch *url* and print the text of every ``div.line-clamp-2`` element."""
    # Imported lazily so the JSONL helpers work without these packages.
    import requests
    from bs4 import BeautifulSoup

    # Without a timeout, requests may wait on the server indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    for content in soup.find_all("div", {"class": "line-clamp-2"}):
        print(content.text)


def main():
    """Run the example invocations from the original scratch scripts, in order."""
    combine_jsonl(
        '4.jsonl',
        'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822'
        '\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl',
        '5.jsonl',
    )

    parquet_to_jsonl('E:\\train-00000-of-00001.parquet', 'e1.jsonl')

    # copy_prompt_before_code_fence() has no example call: the file names of
    # that snippet were lost in the paste.

    append_jsonl('3.jsonl', '1.jsonl')

    delete_key('code1.jsonl', 'code2.jsonl', "system")

    strip_through_marker('code2.jsonl', 'code1.jsonl', '```python\n')
    print("处理完成!")

    replace_first_in_field(
        'C:\\Users\\LENOVO\\Downloads\\output17.jsonl', 'y1.jsonl',
        'input', "question", "Instruction",
    )

    renumber_key('5.jsonl', '6.jsonl', "task_id")
    print("文件已更新。")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.5.8121 "
                      "SLBChan/111 SLBVPV/64-bit"
    }
    print_dataset_viewer_text(
        "https://hf-mirror.com/datasets/openai/openai_humaneval/viewer/"
        "openai_humaneval/test?row=0",
        headers,
    )


if __name__ == "__main__":
    main()