"""One-off JSON Lines (JSONL) dataset-preparation utilities.

Recovered from the payload of the patch "ADD file via upload" (commit
a2b4b0dd, file ``my code.py``).  The uploaded file was a concatenation of
independent scratch scripts with fused ``import`` statements, pasted chat
timestamps and duplicated sections, so it could not be parsed as Python.
Each script is now a function; ``main()`` runs them in the order in which
they appeared, using the original hard-coded paths.

All readers skip blank lines and all writers terminate every record with a
newline, so the output of one step is always valid input for another.
"""

import json

# Page scraped by fetch_line_clamp_texts() when no URL is given.
HUMANEVAL_VIEWER_URL = "https://hf-mirror.com/datasets/openai/openai_humaneval/viewer/openai_humaneval/test?row=0"

# Browser-like headers sent with the scraping request.
REQUEST_HEADERS = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.5.8121 SLBChan/111 SLBVPV/64-bit"
}


def _read_jsonl(path):
    """Return the JSON values stored one per line in *path*.

    Blank lines are skipped instead of being passed to ``json.loads``,
    which would raise on them.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _rewrite_jsonl(input_path, output_path, transform, ensure_ascii=True):
    """Stream every record of *input_path* through *transform* into *output_path*.

    The output file is truncated when it is opened, so *input_path* and
    *output_path* must be different files.  ``ensure_ascii`` is forwarded to
    ``json.dumps``.
    """
    with open(input_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                continue
            record = transform(json.loads(line))
            outfile.write(json.dumps(record, ensure_ascii=ensure_ascii) + '\n')


def combine_jsonl(file1, file2, output_file):
    """Write the records of *file1* followed by those of *file2* to *output_file*.

    Both inputs are read completely before the output is opened, so
    *output_file* may be the same path as one of the inputs.  Non-ASCII text
    is written as-is (UTF-8), not as ``\\uXXXX`` escapes.
    """
    combined_data = _read_jsonl(file1) + _read_jsonl(file2)
    with open(output_file, 'w', encoding='utf-8') as out:
        for entry in combined_data:
            out.write(json.dumps(entry, ensure_ascii=False) + '\n')


def parquet_to_jsonl(parquet_file, jsonl_file):
    """Convert a Parquet file to JSONL, one line per DataFrame row.

    Rows are serialised with ``Series.to_json()``.  The output is opened
    with an explicit UTF-8 encoding rather than the platform default.
    """
    # Imported here so the pure-JSONL helpers work without pandas installed.
    import pandas as pd

    df = pd.read_parquet(parquet_file, engine='pyarrow')
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            f.write(row.to_json() + '\n')


def copy_output_prefix_to_input(input_path, output_path, marker="\n```python"):
    """Set each record's "input" to the part of "output" that precedes *marker*.

    Records without an "output" key, or whose "output" does not contain
    *marker*, are copied unchanged.

    NOTE(review): the original script for this step had lost its opening
    lines (the ``open`` calls and the ``for`` loop) and its indentation.
    This reconstruction assumes that every record is written to the output,
    as in the other per-record scripts of this file -- confirm against the
    author's intent.  The file names are unknown, so ``main()`` does not
    call this function.
    """
    def transform(data):
        if "output" in data:
            value = data["output"]
            index = value.find(marker)
            if index != -1:
                data["input"] = value[:index]
        return data

    _rewrite_jsonl(input_path, output_path, transform)


def append_jsonl(source_filename, target_filename):
    """Append the lines of *source_filename* to *target_filename*.

    A separating newline is written first only when the target exists, is
    non-empty and does not already end with a newline.  Writing it
    unconditionally would leave a blank line that ``json.loads`` rejects.
    The target is created when it does not exist.
    """
    needs_separator = False
    try:
        # Binary mode: seeking relative to the end is not allowed on text files.
        with open(target_filename, 'rb') as existing:
            existing.seek(0, 2)
            if existing.tell() > 0:
                existing.seek(-1, 2)
                needs_separator = existing.read(1) != b'\n'
    except FileNotFoundError:
        pass

    with open(source_filename, 'r', encoding='utf-8') as source_file, \
            open(target_filename, 'a', encoding='utf-8') as target_file:
        if needs_separator:
            target_file.write('\n')
        for line in source_file:
            target_file.write(line)


def remove_key(input_filename, output_filename, key_to_delete):
    """Copy a JSONL file, dropping *key_to_delete* from every record that has it."""
    def transform(data):
        if key_to_delete in data:
            del data[key_to_delete]
        return data

    _rewrite_jsonl(input_filename, output_filename, transform)


def strip_output_before_marker(input_file_path, output_file_path,
                               target_string='```python\n'):
    """Copy a JSONL file, cutting each "output" down to the text after *target_string*.

    Everything up to and including the first occurrence of *target_string*
    is removed.  Records without an "output" key or without the marker are
    copied unchanged.
    """
    def transform(data):
        if 'output' in data:
            content = data['output']
            if target_string in content:
                data['output'] = content.split(target_string, 1)[-1]
        return data

    _rewrite_jsonl(input_file_path, output_file_path, transform)


def replace_first_in_input(input_path, output_path,
                           old="question", new="Instruction"):
    """Copy a JSONL file, replacing the first *old* in each "input" with *new*.

    Only the first occurrence per record is replaced.  Non-ASCII text is
    written as-is (UTF-8).
    """
    def transform(data):
        if 'input' in data:
            data['input'] = data['input'].replace(old, new, 1)
        return data

    _rewrite_jsonl(input_path, output_path, transform, ensure_ascii=False)


def renumber_key(input_path, output_path, key="task_id", start=1):
    """Copy a JSONL file, overwriting *key* with the record's running number.

    Records are numbered from *start* in file order and the number is stored
    as a string.  A record without *key* still consumes a number but is
    written unchanged.  Every record, including the last, ends with a
    newline.
    """
    records = _read_jsonl(input_path)
    with open(output_path, 'w', encoding='utf-8') as out:
        for number, data in enumerate(records, start=start):
            if key in data:
                data[key] = str(number)
            out.write(json.dumps(data) + '\n')


def fetch_line_clamp_texts(url=HUMANEVAL_VIEWER_URL, headers=None, timeout=30):
    """Return the text of every ``<div class="line-clamp-2">`` on the page at *url*.

    ``REQUEST_HEADERS`` is sent when *headers* is None.  The request is
    bounded by *timeout* seconds and an HTTP error status raises
    ``requests.HTTPError`` instead of being parsed as if it were the page.
    """
    # Imported here so the JSONL helpers work without these packages installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(
        url,
        headers=REQUEST_HEADERS if headers is None else headers,
        timeout=timeout,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [content.text for content in soup.find_all("div", {"class": "line-clamp-2"})]


def main():
    """Run every step once, in the order the scripts appeared in the upload."""
    combine_jsonl('4.jsonl', 'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl', '5.jsonl')

    parquet_to_jsonl('E:\\train-00000-of-00001.parquet', 'e1.jsonl')

    append_jsonl('3.jsonl', '1.jsonl')

    remove_key('code1.jsonl', 'code2.jsonl', "system")

    strip_output_before_marker('code2.jsonl', 'code1.jsonl')
    print("处理完成!")

    replace_first_in_input('C:\\Users\\LENOVO\\Downloads\\output17.jsonl', 'y1.jsonl')

    renumber_key('5.jsonl', '6.jsonl')
    print("文件已更新。")

    for text in fetch_line_clamp_texts():
        print(text)


if __name__ == "__main__":
    main()