"""Turn a scraped case-itinerary article into a CSV table.

Extracted from the single code cell of ``Untitled17.ipynb`` (Python 3 kernel).

Pipeline:

1. ``test.txt`` is normalised in place by a fixed list of regex replacements.
2. The normalised text is split into per-case sections, and each section into
   per-date entries, using regular expressions.
3. Case name, date, time and event columns are collected and written to
   ``test.csv``.

The intermediate files ``20.txt`` and ``10.txt`` are scratch files that are
overwritten on every iteration and left on disk afterwards.
"""
import csv
import os
import re

ARTICLE_URL = "https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ"

SOURCE_FILE = "test.txt"
CASE_SCRATCH_FILE = "20.txt"
ENTRY_SCRATCH_FILE = "10.txt"
OUTPUT_FILE = "test.csv"

CSV_HEADER = ["", "病例", "开始日期", "结束日期", "时间", "事件"]

# (pattern, replacement) pairs applied to SOURCE_FILE, in exactly this order.
# Order matters: "确诊病例" and "病例轨迹" must be removed before the bare
# "病例" is prefixed with the "&&" section marker.
# NOTE(review): as written, (":", ":") replaces a colon with itself. The
# pattern may originally have been a full-width punctuation mark -- confirm
# against the original notebook before changing any of these literals.
NORMALISATIONS = [
    (";", "。"),
    (":", ":"),
    ("-", "--"),
    ("确诊病例", ""),
    ("病例轨迹", ""),
    ("病例", "&&病例"),
    ("呼和浩特市应对新型冠状病毒感染", "end"),
]

# Patterns are kept as plain strings and handed to ``re.findall`` / ``re.sub``
# at the point of use (the ``re`` module caches compiled patterns).
CASE_PATTERN = r"病例\d:.*?(?=&&|end)"
CASE_NAME_PATTERN = r"病例\d"
ENTRY_PATTERN = (
    r"\d{1,2}月\d{1,2}?日.*?。+(?=\d{1,2}月\d{1,2}日)"
    r"|\d{1,2}月\d{1,2}?日.*?。+(?=病例)"
)
DATE_PATTERN = r"\d{1,2}月\d{1,2}日+(?=,)"
TIME_PATTERN = r"\d{1,2}:\d{1,2}--\d{1,2}:\d{1,2}|\d{1,2}:\d{1,2}"
EVENT_PATTERN = r"(?<=。)+.*?。"
# Character class: ASCII digits, "月", both quote characters, comma, colon
# and hyphen.
CLEANUP_PATTERN = "[0-9月'\",:-]"


def openreadtxt(file_name):
    """Read *file_name* as UTF-8 and split every line on single spaces.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one item per line. Each item is the list produced by
        ``line.split(' ')``, with newline characters removed from its last
        token.
    """
    rows = []
    with open(file_name, "r", encoding="utf-8") as handle:
        for line in handle:
            tokens = line.split(" ")
            tokens[-1] = tokens[-1].replace("\n", "")
            rows.append(tokens)
    return rows


def updateFile(file, old_str, new_str):
    """Rewrite *file* in place, replacing every match of *old_str*.

    Args:
        file: Path of the UTF-8 text file to rewrite.
        old_str: Regular-expression pattern passed to ``re.sub``. It is NOT
            escaped, so callers may pass character classes.
        new_str: Replacement text passed to ``re.sub``.

    The result is first written to ``<file>.bak``; once both files are
    closed the backup is moved over the original.
    """
    backup = "%s.bak" % file
    with open(file, "r", encoding="utf-8") as src, \
            open(backup, "w", encoding="utf-8") as dst:
        for line in src:
            dst.write(re.sub(old_str, new_str, line))
    # Swap only after both handles are closed; os.replace overwrites the
    # destination in a single step.
    os.replace(backup, file)


def fetch_article_text(url=ARTICLE_URL, timeout=30):
    """Download *url* and return the text of its main article container.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``.text`` of the first element matching
        ``div.rich_media > div.rich_media_inner``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        LookupError: If the page contains no matching container.
    """
    # Imported here so the file helpers above remain importable on machines
    # that do not have the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    containers = soup.select("div.rich_media > div.rich_media_inner ")
    if not containers:
        raise LookupError("no 'div.rich_media_inner' container found at %s" % url)
    return containers[0].text


def _flatten_via_file(path, text):
    """Write *text* to *path* and return ``str(openreadtxt(path))``.

    The extraction patterns are applied to this ``str()`` form of the nested
    token list rather than to the raw text, so the round trip through the
    scratch file is kept as is.
    """
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(text)
    return str(openreadtxt(path))


def _parse_entry(entry):
    """Return ``(date, times, events)`` extracted from one dated entry.

    Raises:
        IndexError: If the entry contains no date matching DATE_PATTERN.
    """
    flat = _flatten_via_file(ENTRY_SCRATCH_FILE, entry)
    found_dates = re.findall(DATE_PATTERN, flat)
    if not found_dates:
        raise IndexError("no date found in itinerary entry: %r" % entry)
    times = re.findall(TIME_PATTERN, flat)

    # Strip digits and punctuation from the scratch file so that only the
    # event descriptions remain between "。" delimiters.
    updateFile(ENTRY_SCRATCH_FILE, CLEANUP_PATTERN, "")
    # NOTE(review): the cleanup above already removes every ",", so this
    # replacement cannot match as written -- confirm the intended character.
    updateFile(ENTRY_SCRATCH_FILE, "日,", "。")
    events = re.findall(EVENT_PATTERN, str(openreadtxt(ENTRY_SCRATCH_FILE)))
    return found_dates[0], times, events


def extract_rows(source=SOURCE_FILE):
    """Parse the normalised *source* file into CSV rows.

    Returns:
        A list of ``(index, case, start_date, end_date, time, event)``
        tuples. The same date is used for both date columns. Columns are
        combined with ``zip``, so the result is as long as the shortest
        column (times are collected independently of events).
    """
    names, dates, times, events = [], [], [], []
    flat = str(openreadtxt(source))
    for section in re.findall(CASE_PATTERN, flat):
        section_flat = _flatten_via_file(CASE_SCRATCH_FILE, section)
        case_name = re.findall(CASE_NAME_PATTERN, section_flat)[0]
        for entry in re.findall(ENTRY_PATTERN, section_flat):
            entry_date, entry_times, entry_events = _parse_entry(entry)
            times += entry_times
            events += entry_events
            # One case name and one date per extracted event.
            dates.extend([entry_date] * len(entry_events))
            names.extend([case_name] * len(entry_events))
    return list(zip(range(len(events)), names, dates, dates, times, events))


def write_csv(rows, path=OUTPUT_FILE):
    """Write the header line followed by *rows* to *path* as UTF-8 CSV."""
    with open(path, "w", encoding="utf-8", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(CSV_HEADER)
        writer.writerows(rows)


def main():
    """Run the whole pipeline in the current working directory."""
    # NOTE(review): the notebook downloaded the article but never stored the
    # text anywhere. Seeding SOURCE_FILE from it only when the file is
    # missing is an assumption about the intent -- confirm. An existing
    # SOURCE_FILE is never overwritten here.
    if not os.path.exists(SOURCE_FILE):
        with open(SOURCE_FILE, "w", encoding="utf-8") as handle:
            handle.write(fetch_article_text())

    for pattern, replacement in NORMALISATIONS:
        updateFile(SOURCE_FILE, pattern, replacement)

    write_csv(extract_rows(SOURCE_FILE), OUTPUT_FILE)


if __name__ == "__main__":
    main()