You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

40 lines
1.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import csv
from bs4 import BeautifulSoup
import requests
# Page whose paragraphs are scanned for numbered items.
url = 'https://zhuanlan.zhihu.com/p/105570623'
# Name of the CSV file the extracted items are written to.
filename = 'extracted_data.csv'
# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block forever on an unresponsive host.
request_timeout_seconds = 10
# One or more digits, a literal period, optional whitespace, then the rest
# of the line (captured). Compiled once because it is reused per paragraph.
numbered_item_pattern = re.compile(r'\d+\.\s*(.*)')


def extract_numbered_items(texts):
    """Collect the text that follows each "<digits>." marker in *texts*.

    Args:
        texts: Iterable of strings (one per paragraph).

    Returns:
        A list of ``[index, content]`` rows. ``index`` counts matches across
        all strings starting at 1 (it is NOT the number found in the text);
        ``content`` is the captured remainder of the line with surrounding
        whitespace stripped.
    """
    rows = []
    for text in texts:
        for captured in numbered_item_pattern.findall(text):
            # Sequence numbers are assigned here, continuous from 1.
            rows.append([len(rows) + 1, captured.strip()])
    return rows


def fetch_paragraph_texts(page_url, timeout=request_timeout_seconds):
    """Download *page_url* and return the text of every ``<p>`` element.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never parsed as if it were the article.
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [paragraph.get_text() for paragraph in soup.find_all('p')]


def write_rows(rows, path):
    """Write a two-column header followed by *rows* to the CSV file *path*."""
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        # Header has exactly the two columns every data row contains; the
        # previous third, empty header cell did not correspond to any data.
        csvwriter.writerow(['序号', '内容'])
        csvwriter.writerows(rows)


def main():
    """Fetch the page, extract the numbered items and save them as CSV."""
    data = extract_numbered_items(fetch_paragraph_texts(url))
    write_rows(data, filename)
    print(f"数据已成功写入{filename}")


if __name__ == "__main__":
    main()