update 2023/10/4/15:27

2 years ago · 12bddb46da
parent 59ce5b740a
commit 12bddb46da
14 changed files with 1526 additions and 0 deletions
--- a/src/medicine/cancer_information.sql
+++ b/src/medicine/cancer_information.sql
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/.gitignore
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/.gitignore
@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/Get_Cancer_Information.iml
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/Get_Cancer_Information.iml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/inspectionProfiles/Project_Default.xml
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/inspectionProfiles/Project_Default.xml
@ -0,0 +1,14 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="1">
+            <item index="0" class="java.lang.String" itemvalue="pillow" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/inspectionProfiles/profiles_settings.xml
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/inspectionProfiles/profiles_settings.xml
@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/misc.xml
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/misc.xml
@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="D:\Python 3.11\python.exe" project-jdk-type="Python SDK" />
+</project>
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/modules.xml
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/.idea/modules.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Get_Cancer_Information.iml" filepath="$PROJECT_DIR$/.idea/Get_Cancer_Information.iml" />
+    </modules>
+  </component>
+</project>
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/spider/Get_Information.py
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/spider/Get_Information.py
@ -0,0 +1,61 @@
+from html.parser import HTMLParser
+import requests
+from lxml import etree
+import re
+# Fetch a page by URL and extract its article node with XPath
+def Anaxpath(url , user , savepath , timeout=30):
+    """Download a page and save its main ``<article>`` element as HTML text.

+    Args:
+        url: Address of the page to fetch, e.g.
+            "https://www.who.int/zh/news-room/fact-sheets/detail/breast-cancer".
+        user: Value sent as the ``User-agent`` request header.
+        savepath: File that receives the serialized article; it is written
+            as UTF-8 and overwritten if it already exists.
+        timeout: Seconds to wait for the server. Without a timeout a stalled
+            connection would block the caller indefinitely.

+    Raises:
+        IndexError: If no node in the page matches the article XPath.
+    """
+    path = "//*[@id=\"PageContent_T0643CD2A003_Col00\"]/article"
+    # The ``with`` statement releases the connection; no explicit close needed.
+    with requests.request('GET', url, headers={'User-agent': user}, timeout=timeout) as f:
+        content = f.text  # raw HTML of the page

+    html = etree.HTML(content)  # parse the HTML and get the DOM root node
+    res = html.xpath(path)
+    if not res:
+        # Same exception type an empty result produced before, but with a
+        # message that names the page and the expression that failed.
+        raise IndexError("no element matches %r in %s" % (path, url))

+    tree = etree.tostring(res[0], encoding='utf-8').decode('utf-8')
+    with open(savepath, 'w', encoding="utf-8") as f1:
+        # One write for the whole document instead of one write per character.
+        f1.write(tree)
+
+def Analyse(htmlpath):
+    """Split a saved article into its ``<h2>`` sections.

+    The file is read as UTF-8 text and scanned with regular expressions.
+    None of the patterns use ``re.DOTALL``, so a match never spans a line
+    break in the file.

+    Args:
+        htmlpath: Path of the HTML text file to analyse.

+    Returns:
+        A list with one entry per heading except the last one. Each entry is
+        a list whose first element is the heading text, followed by the
+        section's items, each prefixed with a single space:

+        * for a heading containing "重要事实", the ``<strong>`` texts found
+          in the first ``<ul>`` of the document;
+        * for any other heading, the ``<p>`` texts of the next unused
+          section body.

+        An entry holds only the heading when its content cannot be found,
+        and the result is empty when the file has no ``<h2>`` at all.
+    """
+    with open(htmlpath, 'r', encoding='utf-8') as f:
+        s = f.read()

+    heading_re = re.compile('<h2>(.*?)</h2>')
+    body_re = re.compile('<p>(.*?)<h2>')
+    list_re = re.compile('<ul>(.*?)</ul>')
+    strong_re = re.compile("<strong>(.*?)</strong>")
+    para_re = re.compile("<p>(.*?)</p>")

+    # Parse the h2 headings.
+    headings = heading_re.findall(s)
+    if not headings:
+        return []
+    # A section body is only recognised when another <h2> follows it, so the
+    # final heading never has a body and is dropped.
+    headings.pop()

+    bodies = body_re.findall(s)
+    body_index = 0
+    res = []

+    for title in headings:
+        entry = [title]
+        if "重要事实" in title:
+            # Key facts are list items rather than paragraphs, so they are
+            # taken from the first <ul> instead of a section body.
+            lists = list_re.findall(s)
+            if lists:
+                entry.extend(" " + fact for fact in strong_re.findall(lists[0]))
+        else:
+            if body_index < len(bodies):
+                # The body pattern consumed the opening <p>; restore it so
+                # the first paragraph is matched as well.
+                section = "<p>" + bodies[body_index]
+                entry.extend(" " + text for text in para_re.findall(section))
+            body_index += 1
+        res.append(entry)
+    return res
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/spider/pycache/Get_Information.cpython-311.pyc
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/spider/pycache/Get_Information.cpython-311.pyc
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/spider/use_code.ipynb
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/spider/use_code.ipynb
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/sql/breast-cancer.txt
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/sql/breast-cancer.txt
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/sql/colorectal-cancer.txt
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/sql/colorectal-cancer.txt
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/sql/lung-cancer.txt
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/sql/lung-cancer.txt
--- a/src/medicine/癌症数据获取/Get_Cancer_Information/sql/savesql.py
+++ b/src/medicine/癌症数据获取/Get_Cancer_Information/sql/savesql.py
@ -0,0 +1,66 @@
+from spider.Get_Information import Anaxpath as ap , Analyse as a
+import pymysql
+# Scrape three WHO cancer fact sheets and store one row per cancer in the
+# `cancer_information` table.
+# NOTE(review): the connection credentials are hard-coded; consider loading
+# them from configuration or the environment before using this elsewhere.
+conn = pymysql.connect(
+    host="127.0.0.1",
+    port=3306,  # port number
+    user="root",  # database user
+    password="123456",  # database password
+    database="db1"  # name of the database to connect to
+)
+cursor = conn.cursor()

+# Lung cancer, breast cancer, colorectal cancer
+l = ["lung-cancer" , "breast-cancer" , "colorectal-cancer"]
+d = {"lung-cancer" : "肺癌" , "breast-cancer" : "乳腺癌" , "colorectal-cancer" : "结直肠癌"}
+user = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47"

+# Values are passed separately from the statement so the driver escapes
+# them; quotes, commas and percent signs in the scraped text are then safe.
+sql_insert = (
+    "insert into cancer_information (`癌症名称` , `重要事实` , `概述` , `症状` , `风险因素` , `诊断` , `治疗和护理`) values "
+    "(%s, %s, %s, %s, %s, %s, %s)"
+)
+column_count = 7  # cancer name plus the six section columns above

+try:
+    for i in l:
+        url = "https://www.who.int/zh/news-room/fact-sheets/detail/" + i
+        savepath = i + ".txt"
+        ap(url , user , savepath)
+        res = a(savepath)
+        # print(res)

+        # Section headings seen on the pages (presumably breast, colorectal
+        # and lung, in that order) and, last, the target column layout:
+        # ['重要事实', '概述', '问题的范围', '谁有风险？', '症状和体征', '治疗', '全球影响']
+        # ['重要事实', '概述', '风险因素', '症状', '预防', '诊断', '治疗和护理', '临床试验']
+        # ['重要事实', '概述', '症状', '预防', '诊断', '治疗和护理', '护理阶段', '临床试验'] lung
+        # ['重要事实', '概述', '症状','风险因素','诊断', '治疗和护理']

+        # Strip leftover <strong> markup. Order matters: the "<strong>a)"
+        # form must go before the plain "<strong>" tag. Percent signs are
+        # left alone because parameter values need no %-escaping.
+        for section in res:
+            for idx, text in enumerate(section):
+                for tag in ("<strong>a)" , "<strong/>" , "<strong>" , "</strong>"):
+                    text = text.replace(tag , '')
+                section[idx] = text

+        # Reshape each page's sections into the six-column target layout.
+        if i == "lung-cancer":
+            res[0][0] = res[0][0][:res[0][0].index("：")]
+            res[3][0] = "风险因素"
+            res.pop()
+            res.pop()
+        elif i == "breast-cancer":
+            res.pop(2)
+            res[2][0] = "风险因素"
+            res[2] , res[3] = res[3] , res[2]
+            res[2][0] = "症状"
+            res.pop()
+            res[-1][0] = "治疗和护理"
+            res.insert(4 , ["诊断" , " "])
+        else:
+            res[2] , res[3] = res[3] , res[2]
+            res.pop(4)
+            res.pop()

+        # One value per column: the cancer name, then each section's items
+        # concatenated (the heading itself, section[0], is not stored).
+        values = [d[i]] + ["".join(section[1:]) for section in res]
+        if len(values) != column_count:
+            raise ValueError(
+                "expected %d values for %s, got %d" % (column_count , i , len(values))
+            )

+        print(cursor.mogrify(sql_insert , values))
+        cursor.execute(sql_insert , values)
+        conn.commit()
+finally:
+    # Release the database resources even if a page fails to parse.
+    cursor.close()
+    conn.close()