You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

40 lines
1.2 KiB

9 months ago
import re
import csv
from bs4 import BeautifulSoup
import requests
# Article to scrape and the CSV file that receives the extracted items.
url = 'https://zhuanlan.zhihu.com/p/105570623'
filename = 'extracted_data.csv'

# Captures whatever follows "<digits>." (plus optional whitespace) up to the
# end of the line. Compiled once so the paragraph loop does not repeat the
# pattern lookup.
NUMBERED_ITEM_RE = re.compile(r'\d+\.\s*(.*)')


def fetch_html(page_url: str, timeout: float = 10.0) -> str:
    """Download *page_url* and return the response body as text.

    Args:
        page_url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On a connection failure, a timeout, or a
            4xx/5xx response status.
    """
    # Without an explicit timeout, requests may wait indefinitely.
    response = requests.get(page_url, timeout=timeout)
    # Fail loudly instead of silently scraping an error page.
    response.raise_for_status()
    return response.text


def extract_numbered_items(texts) -> list:
    """Return ``[index, content]`` rows for every numbered item in *texts*.

    Each string in *texts* is searched for "<digits>." markers; the text that
    follows a marker on the same line becomes one row's content, stripped of
    surrounding whitespace.

    Args:
        texts: Iterable of strings (for example, paragraph texts).

    Returns:
        A list of two-element lists. The index is a running counter starting
        at 1 in order of appearance; it is not the number found in the text.
    """
    contents = (
        match.strip()
        for text in texts
        for match in NUMBERED_ITEM_RE.findall(text)
    )
    return [[index, content] for index, content in enumerate(contents, start=1)]


def write_rows(rows, csvfile) -> None:
    """Write the header line followed by *rows* to *csvfile* in CSV format.

    Args:
        rows: Iterable of ``[index, content]`` rows.
        csvfile: Writable text file object; files should be opened with
            ``newline=''`` as the csv module requires.
    """
    writer = csv.writer(csvfile)
    # Two header columns only, matching the two-element data rows (the
    # previous header carried a stray empty third column).
    writer.writerow(['序号', '内容'])
    writer.writerows(rows)


def main() -> None:
    """Scrape the numbered items from ``url`` and save them to ``filename``."""
    soup = BeautifulSoup(fetch_html(url), 'html.parser')
    # Only the text of <p> elements is searched for numbered items.
    paragraph_texts = (paragraph.get_text() for paragraph in soup.find_all('p'))
    data = extract_numbered_items(paragraph_texts)

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        write_rows(data, csvfile)

    print(f"数据已成功写入{filename}")


# Guard the entry point so importing this module does not hit the network.
if __name__ == "__main__":
    main()