You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
import re
|
|
|
|
|
import csv
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
# Page whose numbered list items are extracted.
url = 'https://zhuanlan.zhihu.com/p/105570623'

# Name of the CSV file the extracted rows are written to.
filename = 'extracted_data.csv'

# Seconds to wait on the server before giving up; without a timeout
# requests.get() can block forever.
REQUEST_TIMEOUT = 10

# Matches "<digits>." and captures the rest of that line. Compiled once
# instead of being looked up for every paragraph.
NUMBERED_ITEM = re.compile(r'\d+\.\s*(.*)')

# Header row: sequence number and content. Rows have exactly two cells, so
# the header has exactly two as well (a stray empty third cell made the CSV
# ragged).
CSV_HEADER = ['序号', '内容']


def fetch_paragraph_texts(page_url, timeout=REQUEST_TIMEOUT):
    """Download ``page_url`` and return the text of each ``<p>`` element.

    Args:
        page_url: URL of the page to download.
        timeout: Seconds to wait for the server.

    Returns:
        A list with one string per ``<p>`` element, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(page_url, timeout=timeout)
    # Fail loudly instead of parsing an error page and writing an empty CSV.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [paragraph.get_text() for paragraph in soup.find_all('p')]


def extract_numbered_items(texts):
    """Collect the text that follows each "<digits>." marker.

    Args:
        texts: Iterable of strings to scan.

    Returns:
        A list of ``[sequence_number, content]`` rows. Sequence numbers are
        assigned consecutively starting at 1 in the order matches are
        found; they are not the digits that appeared in the text.
    """
    rows = []
    for text in texts:
        for content in NUMBERED_ITEM.findall(text):
            rows.append([len(rows) + 1, content.strip()])
    return rows


def write_rows(path, rows):
    """Write the header followed by ``rows`` to ``path`` as UTF-8 CSV.

    Args:
        path: Destination file; overwritten if it already exists.
        rows: Iterable of two-element rows (sequence number, content).
    """
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(CSV_HEADER)
        writer.writerows(rows)


def main():
    """Fetch the page, extract the numbered items and save them as CSV."""
    data = extract_numbered_items(fetch_paragraph_texts(url))
    write_rows(filename, data)
    print(f"数据已成功写入{filename}")


if __name__ == '__main__':
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|