|
|
|
|
import csv
|
|
|
|
|
import requests
|
|
|
|
|
import re, os
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
# Download the WeChat article and isolate its main content container.
url = 'https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ'

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error page.
response.raise_for_status()

# Decode the body as UTF-8. (An earlier assignment from
# response.apparent_encoding was overwritten by this value straight away,
# so it had no effect and has been dropped.)
response.encoding = 'utf-8'
html = response.text

soup = BeautifulSoup(html, 'html.parser')
ans = soup.select('div.rich_media > div.rich_media_inner ')
if not ans:
    # Give a readable error instead of a bare IndexError on ans[0].
    raise RuntimeError(
        "no 'div.rich_media > div.rich_media_inner' element found at %s" % url
    )

# UTF-8 encoded bytes of the container's text.
# NOTE(review): ans1 is not referenced by the code visible in this file, and
# test.txt (processed further down) is not written here -- confirm how the
# scraped text is meant to reach test.txt.
ans1 = ans[0].text.encode()
|
|
|
|
|
|
|
|
|
|
def openreadtxt(file_name):
    """Read a UTF-8 text file into a list of space-separated token lists.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one entry per line of the file. Each entry is the list
        obtained by splitting that line on single spaces, with the trailing
        newline removed. Consecutive spaces therefore produce empty-string
        tokens, and an empty line produces ``['']``.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(file_name, 'r', encoding='utf-8') as file:
        # Split on ' ' explicitly (not on arbitrary whitespace): callers work
        # on str() of this result, so the exact token boundaries matter.
        return [row.rstrip('\n').split(' ') for row in file]
|
|
|
|
|
|
|
|
|
|
def updateFile(file, old_str, new_str):
    """Rewrite ``file`` in place, substituting a regex on every line.

    The result is first written to ``<file>.bak`` and then moved over the
    original, so the original is only replaced once the rewrite is complete.

    Args:
        file: Path of the UTF-8 text file to modify.
        old_str: Regular expression (``re`` syntax) to search for. It is
            treated as a pattern, not as a literal string.
        new_str: Replacement text, as accepted by ``re.sub``.
    """
    tmp_path = "%s.bak" % file
    # Compile up front: an invalid pattern fails before any file is touched.
    pattern = re.compile(old_str)
    with open(file, "r", encoding="utf-8") as src, \
            open(tmp_path, "w", encoding="utf-8") as dst:
        for line in src:
            dst.write(pattern.sub(new_str, line))
    # os.replace overwrites the destination in a single step on every
    # platform, so there is no window in which the original has been removed
    # but the new version is not yet in place.
    os.replace(tmp_path, file)
|
|
|
|
|
|
|
|
|
|
# --- Step 1: normalise test.txt in place -------------------------------------
# Each call applies one regex substitution to every line of the file, in order.
# NOTE(review): nothing visible in this file writes test.txt -- presumably it
# holds the scraped article text; confirm where it comes from.
updateFile(r"test.txt", ";", "。")
# NOTE(review): as written this replaces ':' with ':' (a no-op). Presumably
# one side was meant to be a full-width colon -- confirm against the original.
updateFile(r"test.txt", ":", ":")
updateFile(r"test.txt", "-", "--")
updateFile(r"test.txt", "确诊病例", "")
updateFile(r"test.txt", "病例轨迹", "")
# Prefix every remaining "病例" with "&&" so it can serve as a record separator.
updateFile(r"test.txt", "病例", "&&病例")
# Replace this phrase with "end" so the last record has a terminator.
updateFile(r"test.txt", "呼和浩特市应对新型冠状病毒感染", "end")

# --- Step 2: split the text into one chunk per case --------------------------
# All regexes below run on str() of the nested list returned by openreadtxt,
# i.e. on the list's textual representation, not on the raw file content.
data = str(openreadtxt('test.txt'))
result = re.findall(r"病例\d:.*?(?=&&|end)", data)

# Output columns, filled in parallel.
name = []      # case label, repeated once per event
date = []      # date of the segment the event belongs to
time = []      # time or time range strings found in each segment
through = []   # event descriptions
Num = 0        # total number of events collected

# --- Step 3: extract dates, times and events for every case ------------------
for case_text in result:
    # Round-trip through 20.txt so the chunk gets the same
    # openreadtxt + str() treatment as the full text.
    with open('20.txt', 'w', encoding="utf-8") as f:
        f.write(case_text)
    case_data = str(openreadtxt("20.txt"))

    name1 = re.findall(r"病例\d", case_data)
    # One match per dated segment: from a date up to the sentence end that
    # precedes either the next date or the next "病例".
    getOne = re.findall(r"\d{1,2}月\d{1,2}?日.*?。+(?=\d{1,2}月\d{1,2}日)|\d{1,2}月\d{1,2}?日.*?。+(?=病例)", case_data)

    numname = 0  # number of events found for this case
    for segment in getOne:
        with open('10.txt', 'w', encoding="utf-8") as f:
            f.write(segment)
        segment_data = str(openreadtxt("10.txt"))

        # Raises IndexError if the segment has no date followed by ','.
        date1 = re.findall(r"\d{1,2}月\d{1,2}日+(?=,)", segment_data)[0]
        time += re.findall(r"\d{1,2}:\d{1,2}--\d{1,2}:\d{1,2}|\d{1,2}:\d{1,2}", segment_data)

        # Strip digits, '月', quotes, ',', ':' and '-' from the segment file.
        updateFile(r"10.txt", "[0-9月\'\",:-]", "")
        # NOTE(review): the substitution above already removed every ',', so
        # as written "日," cannot match here. Presumably one of the two
        # patterns used a full-width comma originally -- confirm.
        updateFile(r"10.txt", "日,", "。")
        segment_data = str(openreadtxt("10.txt"))

        # Each event is the text between one '。' and the next.
        events = re.findall("(?<=。)+.*?。", segment_data)
        through += events
        numdate = len(events)
        Num += numdate
        numname += numdate
        # Repeat the segment's date once per event found in it.
        date += [date1] * numdate

    # Repeat the case label once per event found for this case.
    name += [name1[0]] * numname

# --- Step 4: write the table to test.csv --------------------------------------
num = range(0, Num)
# zip stops at the shortest input, so rows are silently dropped if the lists
# differ in length. The same date list feeds both the start and end columns.
# NOTE(review): `time` gains one entry per time match while `through` gains
# one entry per event, so the two columns may not line up -- confirm.
rows = zip(num, name, date, date, time, through)
with open("test.csv", "w", encoding='utf-8', newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["", "病例", "开始日期", "结束日期", "时间", "事件"])
    writer.writerows(rows)
|