"""Tabulate case itineraries from a saved epidemic bulletin into ``test.csv``.

Pipeline (run via ``python getdata.py``):

1. Download the bulletin page and extract its article text.
2. Normalise punctuation/markers in ``test.txt`` *in place*.
3. Pull out every "病例N:" section, split it into dated segments, and collect
   the case name, date, time ranges and events of each segment.
4. Write the collected columns to ``test.csv``.

NOTE(review): the downloaded article text is never used by the rest of the
pipeline; ``test.txt`` must already exist next to the script. Confirm whether
the download was meant to populate ``test.txt``.
NOTE(review): step 2 rewrites ``test.txt`` and is not idempotent (e.g. "-"
becomes "--" on every run), so run it against a fresh copy of the input.
"""
import csv
import io
import os
import re

URL = 'https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ'
SOURCE_FILE = 'test.txt'
OUTPUT_FILE = 'test.csv'

# (regex, replacement) pairs applied to SOURCE_FILE, in this order, before
# parsing. As written, the second pair maps ":" to itself.
NORMALIZATIONS = (
    (";", "。"),
    (":", ":"),
    ("-", "--"),
    ("确诊病例", ""),
    ("病例轨迹", ""),
    ("病例", "&&病例"),
    ("呼和浩特市应对新型冠状病毒感染", "end"),
)

# Patterns are kept as raw strings (not pre-compiled) so that importing this
# module never evaluates them; ``re`` caches the compiled form on first use.
CASE_PATTERN = r"病例\d:.*?(?=&&|end)"
CASE_NAME_PATTERN = r"病例\d"
SEGMENT_PATTERN = (
    r"\d{1,2}月\d{1,2}?日.*?。+(?=\d{1,2}月\d{1,2}日)"
    r"|\d{1,2}月\d{1,2}?日.*?。+(?=病例)"
)
DATE_PATTERN = r"\d{1,2}月\d{1,2}日+(?=,)"
TIME_PATTERN = r"\d{1,2}:\d{1,2}--\d{1,2}:\d{1,2}|\d{1,2}:\d{1,2}"
STRIP_PATTERN = r"""[0-9月'",:-]"""
EVENT_PATTERN = r"(?<=。)+.*?。"


def _fetch_article_body():
    """Download the bulletin page and return its article text as UTF-8 bytes.

    Raises IndexError when the page has no ``div.rich_media_inner`` element.
    """
    # Imported here so the file helpers below stay importable on machines
    # without the scraping dependencies.
    import requests
    from bs4 import BeautifulSoup

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(URL, timeout=30)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    matches = soup.select('div.rich_media > div.rich_media_inner ')
    return matches[0].text.encode()


def _split_rows(lines):
    """Split each line on single spaces, dropping the newline of the last field."""
    rows = []
    for line in lines:
        fields = line.split(' ')
        fields[-1] = fields[-1].replace('\n', '')
        rows.append(fields)
    return rows


def openreadtxt(file_name):
    """Read a UTF-8 text file into a list of rows.

    Each line becomes one row: the line split on single spaces, with the
    trailing newline removed from its last field.
    """
    with open(file_name, 'r', encoding='utf-8') as handle:
        return _split_rows(handle)


def updateFile(file, old_str, new_str):
    """Rewrite ``file`` in place, replacing every match of ``old_str``.

    ``old_str`` is a regular expression (it is passed to ``re.sub``) and is
    applied line by line. The result is first written to ``<file>.bak`` and
    then moved over the original.
    """
    tmp_path = "%s.bak" % file
    with open(file, "r", encoding="utf-8") as src, \
            open(tmp_path, "w", encoding="utf-8") as dst:
        for line in src:
            dst.write(re.sub(old_str, new_str, line))
    # os.replace overwrites the target in one step, so there is no window in
    # which ``file`` does not exist.
    os.replace(tmp_path, file)


def _rows_repr(text):
    """Return ``str(openreadtxt(f))`` for a file ``f`` that contains ``text``.

    The downstream patterns are matched against this list ``repr`` (quotes,
    commas and brackets included), so the same row splitting is applied in
    memory. ``newline=None`` keeps the universal-newline handling a text-mode
    file would apply.
    """
    return str(_split_rows(io.StringIO(text, newline=None)))


def _extract_records(rows_text):
    """Collect the output columns from the ``str()`` of the parsed input rows.

    Returns ``(names, dates, times, events)``. ``names``, ``dates`` and
    ``events`` always have one entry per event; ``times`` holds every time (or
    time range) found and may have a different length.

    Raises IndexError when a dated segment contains no date followed by ",".
    """
    names, dates, times, events = [], [], [], []
    for case_text in re.findall(CASE_PATTERN, rows_text):
        case_repr = _rows_repr(case_text)
        case_name = re.findall(CASE_NAME_PATTERN, case_repr)[0]
        events_in_case = 0
        for segment in re.findall(SEGMENT_PATTERN, case_repr):
            segment_repr = _rows_repr(segment)
            day = re.findall(DATE_PATTERN, segment_repr)[0]
            times += re.findall(TIME_PATTERN, segment_repr)
            # Strip digits, quotes and separators, then look for the
            # sentences that follow a "。".
            cleaned = re.sub(STRIP_PATTERN, "", segment)
            cleaned = re.sub("日,", "。", cleaned)
            found = re.findall(EVENT_PATTERN, _rows_repr(cleaned))
            events += found
            # Every event of a segment shares that segment's date.
            dates += [day] * len(found)
            events_in_case += len(found)
        names += [case_name] * events_in_case
    return names, dates, times, events


def main():
    """Run the whole pipeline; reads/rewrites test.txt and writes test.csv."""
    # NOTE(review): the result is unused, the call is kept so the script
    # still fails early when the page cannot be fetched, as it did before.
    _fetch_article_body()

    for pattern, replacement in NORMALIZATIONS:
        updateFile(SOURCE_FILE, pattern, replacement)

    names, dates, times, events = _extract_records(str(openreadtxt(SOURCE_FILE)))

    # The date is written twice (start and end column). zip() stops at the
    # shortest column, so rows are dropped if ``times`` is shorter than the
    # other columns.
    rows = zip(range(len(events)), names, dates, dates, times, events)
    with open(OUTPUT_FILE, "w", encoding='utf-8', newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["", "病例", "开始日期", "结束日期", "时间", "事件"])
        writer.writerows(rows)


if __name__ == "__main__":
    main()