"""Scrape a WeChat article page and turn case-trajectory text into a CSV.

The script performs three steps at import/run time:

1. Download the article at ``url`` and pull the text of the first element
   matching ``div.rich_media > div.rich_media_inner``.
2. Normalise ``test.txt`` in place with a series of regex substitutions.
3. Split the normalised text into per-case and per-date segments with
   regular expressions and write the collected columns to ``test.csv``.

``10.txt`` and ``20.txt`` are scratch files that are overwritten on every
loop iteration.
"""
import csv
import os
import re

import requests
from bs4 import BeautifulSoup


def openreadtxt(file_name: str) -> list:
    """Read a UTF-8 text file into a list of space-separated token lists.

    Args:
        file_name: Path of the file to read.

    Returns:
        One list per line of the file. Each line is split on single spaces
        and newline characters are removed from its last token.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []
    # The context manager closes the handle even if reading raises.
    with open(file_name, 'r', encoding='utf-8') as file:
        for row in file:
            tmp_list = row.split(' ')  # split each line on single spaces
            tmp_list[-1] = tmp_list[-1].replace('\n', '')  # drop the newline
            data.append(tmp_list)
    return data


def updateFile(file: str, old_str: str, new_str: str) -> None:
    """Rewrite ``file`` in place, substituting ``old_str`` with ``new_str``.

    Args:
        file: Path of the UTF-8 text file to rewrite.
        old_str: Regular-expression pattern passed to ``re.sub``.
        new_str: Replacement text passed to ``re.sub``.

    Raises:
        OSError: If the file or its ``.bak`` sibling cannot be opened or
            swapped.
        re.error: If ``old_str`` is not a valid pattern.
    """
    backup = "%s.bak" % file
    with open(file, "r", encoding="utf-8") as f1, \
            open(backup, "w", encoding="utf-8") as f2:
        for line in f1:
            f2.write(re.sub(old_str, new_str, line))
    # os.replace swaps the rewritten copy over the original in one step, so
    # there is no window in which ``file`` is missing.
    os.replace(backup, file)


def _round_trip(path: str, text: str) -> str:
    """Write ``text`` to ``path`` and return ``str(openreadtxt(path))``."""
    with open(path, 'w', encoding="utf-8") as f:
        f.write(text)
    return str(openreadtxt(path))


# --- Fetch the article ------------------------------------------------------
url = ' https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ'
response = requests.get(url)
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')
ans = soup.select('div.rich_media > div.rich_media_inner ')
# NOTE(review): ans1 is not referenced again in the visible code; test.txt
# is expected to exist already when the substitutions below run -- confirm.
ans1 = ans[0].text.encode()

# --- Normalise test.txt in place --------------------------------------------
updateFile(r"test.txt", ";", "。")
# NOTE(review): pattern and replacement are identical as written, so this
# pass leaves the text unchanged -- possibly a different colon character was
# intended; confirm.
updateFile(r"test.txt", ":", ":")
updateFile(r"test.txt", "-", "--")
updateFile(r"test.txt", "确诊病例", "")
updateFile(r"test.txt", "病例轨迹", "")
updateFile(r"test.txt", "病例", "&&病例")  # "&&" marks the start of each case
updateFile(r"test.txt", "呼和浩特市应对新型冠状病毒感染", "end")  # end sentinel

data = openreadtxt('test.txt')
data = str(data)
# One match per case: from its "病例<digit>:" header up to the next "&&"
# marker or the "end" sentinel.
result = re.findall(r"病例\d:.*?(?=&&|end)", data)

# --- Collect the CSV columns ------------------------------------------------
name = []
date = []
time = []
through = []
Num = 0
for case_text in result:
    data = _round_trip('20.txt', case_text)
    name1 = re.findall(r"病例\d", data)
    # Segments that start at a "<m>月<d>日" date and run up to the next date
    # or the next "病例".
    getOne = re.findall(
        r"\d{1,2}月\d{1,2}?日.*?。+(?=\d{1,2}月\d{1,2}日)"
        r"|\d{1,2}月\d{1,2}?日.*?。+(?=病例)",
        data,
    )
    numname = 0
    for day_text in getOne:
        data = _round_trip('10.txt', day_text)
        # Raises IndexError when the segment has no "<m>月<d>日," prefix.
        date1 = re.findall(r"\d{1,2}月\d{1,2}日+(?=,)", data)[0]
        date1 = date1.split()
        time += re.findall(
            r"\d{1,2}:\d{1,2}--\d{1,2}:\d{1,2}|\d{1,2}:\d{1,2}", data
        )
        # Strip digits, quotes and punctuation so only the event text remains.
        updateFile(r"10.txt", "[0-9月\'\",:-]", "")
        updateFile(r"10.txt", "日,", "。")
        data = str(openreadtxt("10.txt"))
        t = re.findall("(?<=。)+.*?。", data)
        through += t
        numdate = len(t)
        Num += numdate
        numname += numdate
        # Repeat the segment's date once per extracted event.
        date += date1 * numdate
    # Repeat the case label once per event collected for this case.
    name2 = name1[0].split()
    name += name2 * numname

# --- Write the CSV ----------------------------------------------------------
num = range(0, Num)
# zip() stops at the shortest input, so rows are dropped if the columns have
# different lengths. NOTE(review): confirm `time` is expected to line up with
# `through`.
rows = zip(num, name, date, date, time, through)
with open("test.csv", "w", encoding='utf-8', newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["", "病例", "开始日期", "结束日期", "时间", "事件"])
    writer.writerows(rows)