# LargeModelCode/my code.py

import json

def combine_jsonl(file1, file2, output_file):
    """Concatenate two JSON Lines files into one JSON Lines file.

    Records from ``file1`` are written first, followed by the records from
    ``file2``. Every record is parsed and re-serialised, so each output line
    is a single JSON document with non-ASCII characters kept unescaped.

    Args:
        file1: Path of the first UTF-8 encoded JSONL file.
        file2: Path of the second UTF-8 encoded JSONL file.
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If an input cannot be opened or the output cannot be written.
    """
    combined_data = []

    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            for line in src:
                # Blank lines (typically a trailing empty line) carry no
                # record, and json.loads would raise on them.
                if not line.strip():
                    continue
                combined_data.append(json.loads(line))

    # Both inputs are read completely before the output is opened, so
    # output_file may be the same path as one of the inputs.
    with open(output_file, 'w', encoding='utf-8') as out:
        for entry in combined_data:
            out.write(json.dumps(entry, ensure_ascii=False) + '\n')

# Example usage: merge 4.jsonl with the MBPP test split and write the result to 5.jsonl
combine_jsonl('4.jsonl', 'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl', '5.jsonl')

def combine_jsonl(file1, file2, output_file):
    """Merge two JSONL files, in order, into ``output_file``.

    Each non-blank line of ``file1`` and then of ``file2`` is decoded as JSON
    and written back out as one line, without escaping non-ASCII characters.

    Args:
        file1: Path of the first UTF-8 encoded JSONL input.
        file2: Path of the second UTF-8 encoded JSONL input.
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If a file cannot be opened, read, or written.
    """
    combined_data = []

    for source_path in (file1, file2):
        with open(source_path, 'r', encoding='utf-8') as handle:
            # Whitespace-only lines are skipped: they are not records and
            # would make json.loads raise.
            combined_data.extend(
                json.loads(raw) for raw in handle if raw.strip()
            )

    # The output is opened only after both inputs were consumed, which keeps
    # the call safe when output_file names one of the inputs.
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n'
            for entry in combined_data
        )

# Example usage: merge 4.jsonl with the MBPP test split and write the result to 5.jsonl
combine_jsonl('4.jsonl', 'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl', '5.jsonl')


def combine_jsonl(file1, file2, output_file):
    """Write the records of two JSONL files to a single JSONL file.

    The records of ``file1`` come first, then those of ``file2``. Lines are
    decoded and re-encoded, so every output line holds exactly one JSON
    document; non-ASCII characters are written unescaped.

    Args:
        file1: Path of the first UTF-8 encoded JSONL file.
        file2: Path of the second UTF-8 encoded JSONL file.
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If an input cannot be read or the output cannot be written.
    """
    combined_data = []

    for input_path in (file1, file2):
        with open(input_path, 'r', encoding='utf-8') as reader:
            for text in reader:
                # Only lines with content are records; an empty or
                # whitespace-only line would make json.loads raise.
                if text.strip():
                    combined_data.append(json.loads(text))

    # Inputs are fully loaded before the output is truncated, so writing
    # back onto one of the input paths does not lose data.
    serialised = [
        json.dumps(entry, ensure_ascii=False) + '\n' for entry in combined_data
    ]
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(serialised)

# Example usage: merge 4.jsonl with the MBPP test split and write the result to 5.jsonl
combine_jsonl('4.jsonl', 'C:\\Users\\LENOVO\\Documents\\WeChat Files\\wxid_3kv9umjbvkr822\\FileStorage\\File\\2024-10\\3mbpp_test1.jsonl', '5.jsonl')

import pandas as pd


def parquet_to_jsonl(parquet_file, jsonl_file):
    """Convert a parquet file into a JSON Lines file, one row per line.

    Args:
        parquet_file: Path of the parquet file, read with the pyarrow engine.
        jsonl_file: Path of the JSONL file to create or overwrite.
    """
    # Load the parquet file into a DataFrame
    df = pd.read_parquet(parquet_file, engine='pyarrow')

    # Serialise the whole frame in one call. Iterating with iterrows() builds
    # one Series per row, which does not preserve column dtypes (integers in
    # a frame that also has float columns would be written as 1.0).
    json_lines = df.to_json(orient='records', lines=True)

    with open(jsonl_file, 'w', encoding='utf-8') as f:
        # An empty frame produces no records; otherwise make sure the last
        # record is newline-terminated regardless of the pandas version.
        if json_lines.strip():
            f.write(json_lines.rstrip('\n') + '\n')


# Example usage: convert the downloaded parquet shard into e1.jsonl
parquet_file = 'E:\\train-00000-of-00001.parquet'
jsonl_file = 'e1.jsonl'
parquet_to_jsonl(parquet_file, jsonl_file)
#print(1)

(line)
        if "output" in data:
            value = data["output"]
            # 查找特定字符串的位置
            index = value.find("\n```python")
            if index!= -1:
                # 如果找到特定字符串，保留其前面的内容
                new_value = value[:index]
                data["input"] = new_value
        f_out.write(json.dumps(data) + '\n')

# Names of the file to read from and the file to append to
source_filename = '3.jsonl'
target_filename = '1.jsonl'

# Open the source for reading and the target for appending ('a+' creates the
# target when it does not exist yet)
with open(source_filename, 'r', encoding='utf-8') as source_file, \
        open(target_filename, 'a+', encoding='utf-8') as target_file:
    # Check whether the target already holds any content
    target_file.seek(0)  # rewind to the start before probing
    is_target_empty = target_file.read(1) == ''

    if not is_target_empty:
        # Look at the last byte through a separate binary handle: text-mode
        # files cannot seek relative to the end. Nothing has been written
        # through target_file yet, so the on-disk content is current.
        with open(target_filename, 'rb') as tail_probe:
            tail_probe.seek(-1, 2)
            ends_with_newline = tail_probe.read(1) == b'\n'

        target_file.seek(0, 2)  # back to the end of the file
        # Add a separator only when the last existing line is unterminated;
        # an unconditional '\n' would insert a blank line into the JSONL.
        if not ends_with_newline:
            target_file.write('\n')

    # Copy the source content onto the end of the target
    for line in source_file:
        target_file.write(line)

# RYin: 01-20 15:08:16
import json

# Source and destination JSONL files
input_filename = 'code1.jsonl'
output_filename = 'code2.jsonl'

# Key that is stripped from every record
key_to_delete = "system"

# Stream the input record by record and write each record back without the key
with open(input_filename, 'r', encoding='utf-8') as infile, \
        open(output_filename, 'w', encoding='utf-8') as outfile:
    for line in infile:
        data = json.loads(line)
        # Records that do not carry the key pass through unchanged
        if key_to_delete in data:
            del data[key_to_delete]
        # One serialised record per output line
        outfile.write(json.dumps(data) + '\n')

# RYin: 01-20 15:08:25
import json

# File locations and the marker whose leading text is discarded
input_file_path = 'code2.jsonl'
output_file_path = 'code1.jsonl'
target_string = '```python\n'

# Rewrite the JSONL file record by record
with open(input_file_path, 'r', encoding='utf-8') as infile, \
        open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Each line is one JSON document
        data = json.loads(line)

        # Only records that have an 'output' field are touched
        if 'output' in data:
            content = data['output']
            if target_string in content:
                # Keep what follows the first occurrence of the marker;
                # the marker and everything before it are dropped
                content = content.partition(target_string)[2]
                data['output'] = content

        # Emit the (possibly modified) record as one line
        print(json.dumps(data), file=outfile)

print("处理完成！")

import json

# Input file, output file and the marker that ends the text to remove
input_file_path = 'code2.jsonl'
output_file_path = 'code1.jsonl'
target_string = '```python\n'

# Process the JSONL file one record at a time
with open(input_file_path, 'r', encoding='utf-8') as infile, \
        open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Decode the JSON document on this line
        data = json.loads(line)

        # Records without an 'output' field are copied as they are
        if 'output' in data:
            content = data['output']
            if target_string in content:
                # The marker is known to be present, so a single split
                # yields exactly the text before it and the text after it
                _, content = content.split(target_string, 1)
                data['output'] = content

        # Write the record back as a single line
        outfile.write(f"{json.dumps(data)}\n")

print("处理完成！")

import json

# Copy output17.jsonl to y1.jsonl, rewriting the 'input' field of each record
with open('C:\\Users\\LENOVO\\Downloads\\output17.jsonl', 'r', encoding='utf-8') as infile, \
        open('y1.jsonl', 'w', encoding='utf-8') as outfile:
    for line in infile:
        data = json.loads(line)
        if 'input' in data:
            # The count of 1 restricts the substitution to the first
            # occurrence of "question"
            data['input'] = data['input'].replace("question", "Instruction", 1)
        # Non-ASCII characters are written unescaped, one record per line
        print(json.dumps(data, ensure_ascii=False), file=outfile)

import json

# Key whose value is replaced, and the counter the replacement is taken from
old_key = "task_id"
new_value = 0

# Load every line of the JSON Lines file
with open('5.jsonl', 'r', encoding='utf-8') as file:
    lines = list(file)

# Serialised records, in input order
updated_lines = []

# The counter advances for every line, whether or not the record has the key
for new_value, line in enumerate(lines, start=1):
    data = json.loads(line)

    # Records carrying the key receive the 1-based line number as a string
    if old_key in data:
        data[old_key] = str(new_value)

    updated_lines.append(json.dumps(data))

# Write the records to a new JSON Lines file, separated by newlines
# (no newline is written after the last record)
with open('6.jsonl', 'w', encoding='utf-8') as file:
    file.write('\n'.join(updated_lines))

print("文件已更新。")

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.5.8121 SLBChan/111 SLBVPV/64-bit"
}
# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.get(
    "https://hf-mirror.com/datasets/openai/openai_humaneval/viewer/openai_humaneval/test?row=0",
    headers=headers,
    timeout=30,
)
# Stop on 4xx/5xx responses instead of parsing an error page.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
# Alternative selectors that were tried:
# all_content = soup.select('[class="hljs-ln-code"]')
# all_content = soup.find_all("div",{"class":"htmledit_views"})
all_content = soup.find_all("div", {"class": "line-clamp-2"})
# Print the text of every matching <div>
for content in all_content:
    # content_string = content.string
    # if "//" not in content.text:
    print(content.text)
# with open("./data.txt","w",encoding="utf-8") as f:
#     f.write(content.text)