# ReciteWords/pythonProject/test.py

import re
import csv
from bs4 import BeautifulSoup
import requests

# Fetch one Zhihu column article and export the text that follows each
# "<number>." marker inside its <p> elements to a CSV file.
url = 'https://zhuanlan.zhihu.com/p/105570623'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=10)
# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
# and reporting success for an empty CSV.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Captures whatever follows "<digits>." (plus optional whitespace) up to the
# end of the line; "." does not match newlines without re.DOTALL.
# Compiled once because it is applied to every paragraph.
item_pattern = re.compile(r'\d+\.\s*(.*)')

# Every <p> element of the page, in document order.
paragraphs = soup.find_all('p')

# Stripped capture for each match, in the order the matches are found.
contents = [
    match.strip()
    for paragraph in paragraphs
    for match in item_pattern.findall(paragraph.get_text())
]

# Rows are [sequence number, content]. The sequence number is regenerated
# here, counting from 1; the number matched on the page is discarded.
data = [[index, content] for index, content in enumerate(contents, start=1)]

# Name of the CSV file to write (relative to the current working directory).
filename = 'extracted_data.csv'

# newline='' lets the csv module control line endings itself.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    # The header has exactly two columns, matching the two fields of every
    # data row (the previous trailing empty header cell made the file ragged).
    csvwriter.writerow(['序号', '内容'])
    csvwriter.writerows(data)

print(f"数据已成功写入{filename}")