# educoder

#Web crawling, as an important means of obtaining information from the internet, should be an essential skill for every computer science learner. After following the educoder (头歌) tutorials to learn the basic principles and workflow of web crawlers, our group completed the in-class tasks and then chose and carried out a simple extracurricular task.

#In addition, after reading a number of articles about web crawlers in the CSDN community, the author (group leader) came to realize that crawling is a powerful technique with a wide range of application scenarios: a truly skilled crawler engineer can scrape information that is otherwise hard to obtain, while short-term study only scratches the surface of the technique and allows scraping of pages with a simple structure. For the hands-on part, therefore, our group only implements text scraping of structurally simple web pages.

import requests

url = "https://www.bitpush.news/covid19/"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
# Request the page with a browser-like User-Agent and keep the HTML text for parsing.
response = requests.get(url, headers=headers)
html = response.text

from lxml import etree

# Build an explicit UTF-8 parser and use it when parsing the downloaded page.
parse = etree.HTMLParser(encoding='utf-8')
doc = etree.HTML(html, parse)

# Country/region names sit in a <span> inside the first cell of each table row.
country = doc.xpath('//div[@class="table_container"]//tbody/tr/td/span/text()')
# Second cell: case counts; strip thousands separators such as "1,234".
person = doc.xpath('//div[@class="table_container"]//tbody/tr/td[2]/text()')
person = [x.replace(",", "") for x in person]
# Third cell: death counts, cleaned the same way.
death = doc.xpath('//div[@class="table_container"]//tbody/tr/td[3]/text()')
death = [x.replace(",", "") for x in death]
# Combine the three parallel lists into (area, cases, deaths) rows.
message = list(zip(country, person, death))

import csv

# newline="" keeps csv.writer from inserting blank rows on Windows.
with open("全球以及美国各州疫情数据爬取.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerows(message)

import pandas as pd

# Load the CSV back in for plotting (the path is specific to the author's machine).
df = pd.read_csv("C://Users/86184/Desktop/Computer Desk/期末用/全球疫情数据爬取.csv", names=['area', 'person', 'death'], engine="python", encoding='gbk')

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']   # render the Chinese labels correctly
plt.rcParams['figure.figsize'] = (100, 50)

x = df["area"].values
y = df["death"].values
plt.bar(x, y)
plt.xlabel("国家/地区", fontsize=14)
plt.ylabel("死亡人数", fontsize=14)
plt.show()

#In the code above, what we essentially did was extract the content under a specific set of parallel tags in the page's HTML. However, locating a particular tag inside a complex HTML document is itself a tedious job, so in the next script we use a list slice to keep only part of the content under those parallel tags, namely the COVID data for the individual US states.
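
#As a minimal illustration of that slicing idea (the rows below are made-up placeholder values, not real data): the scraped table rows form one flat list with the worldwide entries first and the US state entries at the end, so a negative slice keeps just the tail we need. The real script that follows does the same thing with message[-58:].

rows = [("Global", "1000", "10"), ("USA", "500", "5"),
        ("California", "100", "1"), ("Texas", "80", "2")]    # placeholder data, not scraped
us_states = rows[-2:]      # keep only the trailing "state" rows
print(us_states)           # [('California', '100', '1'), ('Texas', '80', '2')]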

import requests

url = "https://www.bitpush.news/covid19/"
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
response = requests.get(url, headers=headers)
html = response.text

from lxml import etree

parse = etree.HTMLParser(encoding='utf-8')
doc = etree.HTML(html, parse)

# Same extraction as before: area names, case counts and death counts from the parallel table cells.
area = doc.xpath('//div[@class="table_container"]//tbody/tr/td/span/text()')
person = doc.xpath('//div[@class="table_container"]//tbody/tr/td[2]/text()')
person = [x.replace(",", "") for x in person]
death = doc.xpath('//div[@class="table_container"]//tbody/tr/td[3]/text()')
death = [x.replace(",", "") for x in death]
message = list(zip(area, person, death))

import csv

# Keep only the last 58 rows, i.e. the US state entries at the end of the table.
with open("美国数据.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerows(message[-58:])

import pandas as pd

df = pd.read_csv("C://Users/86184/Desktop/Computer Desk/期末用/美国数据.csv", names=['area', 'person', 'death'], engine="python", encoding='gbk')

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['figure.figsize'] = (100, 50)

x = df["area"].values
y = df["death"].values
plt.bar(x, y)
plt.xlabel("州", fontsize=14)
plt.ylabel("死亡人数", fontsize=14)
plt.show()

#Finally, we used the crawler to retrieve the text of the list tutorial from the Runoob (菜鸟教程) website.

import requests

url = "https://www.runoob.com/python3/python3-list.html"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
response = requests.get(url, headers=headers)
html = response.text

from lxml import etree

parse = etree.HTMLParser(encoding='utf-8')
doc = etree.HTML(html, parse)

# Absolute XPaths to the article title (tx1) and to every paragraph of the tutorial body (tx2).
tx1 = doc.xpath('/html/body/div[4]/div/div[2]/div/div[3]/div/h1/text()')
tx2 = doc.xpath('/html/body/div[4]/div/div[2]/div/div[3]/div/p/text()')

# Collect the paragraphs, putting a newline after every one except the last.
tx3 = []
for i in range(len(tx2)):
    tx3.append(tx2[i])
    if i != len(tx2) - 1:
        tx3.append('\n')

with open('菜鸟网页列表爬取.txt', 'w', encoding='utf-8') as f:
    f.writelines(tx3)

#In addition, the data-visualization figures are attached in the report document.