update 2023/10/4/15:27

master
薛皓天 2 years ago
parent 59ce5b740a
commit 12bddb46da

File diff suppressed because one or more lines are too long

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

@ -0,0 +1,14 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="pillow" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="D:\Python 3.11\python.exe" project-jdk-type="Python SDK" />
</project>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Get_Cancer_Information.iml" filepath="$PROJECT_DIR$/.idea/Get_Cancer_Information.iml" />
</modules>
</component>
</project>

@ -0,0 +1,61 @@
from html.parser import HTMLParser
import requests
from lxml import etree
import re
# Fetch a page and extract its article node via XPath
def Anaxpath(url, user, savepath, timeout=30):
    """Download ``url`` and save its main ``<article>`` element to ``savepath``.

    The page is fetched with ``requests``, parsed with ``lxml`` and the first
    node matching the hard-coded article XPath is serialized to a UTF-8 file.

    Args:
        url: Address of the page to fetch.
        user: Value sent as the ``User-agent`` request header.
        savepath: Path of the file that receives the serialized markup.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If no node in the page matches the article XPath.
    """
    path = "//*[@id=\"PageContent_T0643CD2A003_Col00\"]/article"
    # The context manager releases the connection; no explicit close() needed.
    with requests.request('GET', url, headers={'User-agent': user},
                          timeout=timeout) as f:
        # Fail on 4xx/5xx instead of trying to parse an error page.
        f.raise_for_status()
        content = f.text  # HTML text of the response

    html = etree.HTML(content)  # parse the HTML; yields the DOM root node
    # etree.HTML returns None for an empty document, so guard before querying.
    res = html.xpath(path) if html is not None else []
    if not res:
        raise IndexError("no element matches XPath %r in %s" % (path, url))

    tree = etree.tostring(res[0], encoding='utf-8').decode('utf-8')
    with open(savepath, 'w', encoding="utf-8") as f1:
        # One write of the whole string instead of one call per character.
        f1.write(tree)
def Analyse(htmlpath):
    """Split a saved article fragment into ``[heading, item, ...]`` sections.

    Every ``<h2>`` heading except the last one becomes a section. A heading
    containing "重要事实" is filled with the ``<strong>`` texts of the first
    ``<ul>`` in the file; every other heading is filled, in document order,
    with the ``<p>`` texts found between it and the next ``<h2>``. Each item
    is prefixed with a single space.

    None of the patterns match across line breaks, so only markup that sits
    on one line is picked up.

    Args:
        htmlpath: Path of a UTF-8 text file containing the article markup.

    Returns:
        A list of lists; each inner list starts with the heading text followed
        by that section's items. Empty when the file has no ``<h2>`` heading.
        A section whose list or paragraph block cannot be found contains only
        its heading.
    """
    with open(htmlpath, 'r', encoding='utf-8') as f:
        s = f.read()

    # Parse the h2 headings.
    res_h2 = re.findall(r'<h2>(.*?)</h2>', s)
    if not res_h2:
        return []
    # The body pattern below needs a following <h2>, so the final heading can
    # never receive a body; presumably that is why it is discarded.
    res_h2.pop()

    res_p = re.findall(r'<p>(.*?)<h2>', s)
    res_x = re.findall(r'<ul>(.*?)</ul>', s)
    strong_re = re.compile(r"<strong>(.*?)</strong>")
    para_re = re.compile(r"<p>(.*?)</p>")

    res = []
    j = 0  # index of the next unused paragraph block in res_p
    for heading in res_h2:
        section = [heading]
        if "重要事实" in heading:
            # Key facts are a bullet list, not paragraphs, so they are parsed
            # separately and do not consume a paragraph block.
            if res_x:
                section.extend(" " + text for text in strong_re.findall(res_x[0]))
        else:
            if j < len(res_p):
                # The capture starts after the opening tag; put it back so the
                # first paragraph of the block is matched too.
                block = "<p>" + res_p[j]
                section.extend(" " + text for text in para_re.findall(block))
            j += 1
        res.append(section)
    return res

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,66 @@
from spider.Get_Information import Anaxpath as ap , Analyse as a
import pymysql
# Scrape three WHO fact sheets, normalise each one to the same six sections
# and insert one row per cancer type into the `cancer_information` table.
conn = pymysql.connect(
    host="127.0.0.1",
    port=3306,  # port number
    user="root",  # database user
    # NOTE(review): credential is hard-coded; consider reading it from the
    # environment or a config file kept out of version control.
    password="123456",  # database password
    database="db1",  # name of the database to connect to
)
cursor = conn.cursor()
# Lung cancer, breast cancer, colorectal cancer.
l = ["lung-cancer", "breast-cancer", "colorectal-cancer"]
d = {"lung-cancer": "肺癌", "breast-cancer": "乳腺癌", "colorectal-cancer": "结直肠癌"}
user = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47"

# Markup fragments removed from every extracted string, applied in this order
# (the longer "<strong>a)" must go before the plain "<strong>").
_MARKUP_FRAGMENTS = ("<strong>a)", "<strong/>", "<strong>", "</strong>")

# Values are passed as query parameters so the driver does the quoting; this
# also means "%" in the scraped text must NOT be doubled.
sql_insert = (
    "insert into cancer_information "
    "(`癌症名称` , `重要事实` , `概述` , `症状` , `风险因素` , `诊断` , `治疗和护理`) "
    "values (%s, %s, %s, %s, %s, %s, %s)"
)
_COLUMN_COUNT = 7

try:
    for i in l:
        url = "https://www.who.int/zh/news-room/fact-sheets/detail/" + i
        savepath = i + ".txt"
        ap(url, user, savepath)
        res = a(savepath)
        # Headings noted by the original author while inspecting `res`
        # (only the third line was labelled, as "lung"):
        # ['重要事实', '概述', '问题的范围', '谁有风险?', '症状和体征', '治疗', '全球影响']
        # ['重要事实', '概述', '风险因素', '症状', '预防', '诊断', '治疗和护理', '临床试验']
        # ['重要事实', '概述', '症状', '预防', '诊断', '治疗和护理', '护理阶段', '临床试验'] lung
        # Target layout after the per-page adjustments below:
        # ['重要事实', '概述', '症状','风险因素','诊断', '治疗和护理']
        for section in res:
            for idx, text in enumerate(section):
                for fragment in _MARKUP_FRAGMENTS:
                    text = text.replace(fragment, '')
                section[idx] = text

        # Page-specific reshaping so every page ends up with the same six
        # sections in column order. The indices are tied to the page layouts
        # listed above.
        if i == "lung-cancer":
            # NOTE(review): the delimiter literal appears empty in this view,
            # which truncates the heading to ""; an invisible character may
            # have been lost -- confirm against the repository copy.
            res[0][0] = res[0][0][:res[0][0].index("")]
            res[3][0] = "风险因素"
            res.pop()
            res.pop()
        elif i == "breast-cancer":
            res.pop(2)
            res[2][0] = "风险因素"
            res[2], res[3] = res[3], res[2]
            res[2][0] = "症状"
            res.pop()
            res[-1][0] = "治疗和护理"
            res.insert(4, ["诊断", " "])
        else:
            res[2], res[3] = res[3], res[2]
            res.pop(4)
            res.pop()

        # Collapse every section into [heading, concatenated body text].
        save = [[section[0], "".join(section[1:])] for section in res]

        # Row = cancer name followed by one body per section. Built directly
        # rather than joined and re-split on ",", which would break on any
        # ASCII comma inside the text.
        t = (d[i],) + tuple(body for _, body in save)
        if len(t) != _COLUMN_COUNT:
            raise ValueError(
                "expected %d values for %s, got %d" % (_COLUMN_COUNT, i, len(t))
            )

        # Show the exact statement that will be sent, as the script did before.
        print(cursor.mogrify(sql_insert, t))
        cursor.execute(sql_insert, t)
        conn.commit()
finally:
    # Release the cursor and connection even if a page fails.
    cursor.close()
    conn.close()
Loading…
Cancel
Save