

Web scraping is an important way of gathering information from the web and should be an essential skill for every computer science learner. After following the EduCoder (头歌) tutorials to learn the basic principles and workflow of scraping, our group completed the in-class tasks and then selected and carried out some simple extracurricular tasks.

In addition, after reading several articles on web scraping in the CSDN community, our group leader came to see scraping as a powerful technique with a wide range of applications: a truly skilled scraping engineer can extract information that is otherwise hard to obtain, whereas short-term study only scratches the surface and supports scraping of simply structured pages. For the hands-on part, our group therefore limited itself to extracting text from simple web pages.

The first task scrapes the COVID-19 statistics table from bitpush.news and writes it to a CSV file:

```python
import csv

import requests
from lxml import etree

url = "https://www.bitpush.news/covid19/"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
response = requests.get(url, headers=headers)
html = response.text

# Parse the page and read the three columns of the statistics table.
parse = etree.HTMLParser(encoding='utf-8')
doc = etree.HTML(html, parser=parse)
country = doc.xpath('//div[@class="table_container"]//tbody/tr/td/span/text()')
person = doc.xpath('//div[@class="table_container"]//tbody/tr/td[2]/text()')
person = [x.replace(",", "") for x in person]  # strip thousands separators
death = doc.xpath('//div[@class="table_container"]//tbody/tr/td[3]/text()')
death = [x.replace(",", "") for x in death]

message = list(zip(country, person, death))
# newline="" keeps the csv module from inserting blank rows on Windows.
with open("全球以及美国各州疫情数据爬取.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerows(message)
```
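The three XPath queries above build three parallel lists, and `zip` assumes they line up one-to-one; if any cell is missing in some row, the columns silently shift against each other. A minimal row-wise sketch that keeps the fields of each `<tr>` together (same table structure assumed as above):

```python
rows = []
for tr in doc.xpath('//div[@class="table_container"]//tbody/tr'):
    name = tr.xpath('./td/span/text()')
    cases = tr.xpath('./td[2]/text()')
    deaths = tr.xpath('./td[3]/text()')
    if name and cases and deaths:  # skip malformed rows instead of misaligning
        rows.append((name[0], cases[0].replace(",", ""), deaths[0].replace(",", "")))
```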

df = pd.read_csv("C://Users/86184/Desktop/Computer Desk/期末用/全球疫情数据爬取.csv",names=['area','person','death'],engine= "python",encoding='gbk') import matplotlib.pyplot as plt plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['figure.figsize'] = (100,50) x = df["area"].values y = df["death"].values plt.bar(x, y) plt.xlabel("国家/地区",fontsize=14) plt.ylabel("死亡人数",fontsize=14) plt.show() #在上述代码中我们本质上实现了对网页html特定并列标签下内容的获取但在一个复杂的网页html语言中寻找特定标签本就是一件繁琐的事情所以在下一段代码中我们利用列表实现了对并列标签下部分内容的截取即获取美国各州疫情数据的获取。 import requests url = "https://www.bitpush.news/covid19/" headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"} response = requests.get(url, headers=headers) html = response.text from lxml import etree parse = etree.HTMLParser(encoding='utf-8') doc = etree.HTML(html) area = doc.xpath('//div[@class="table_container"]//tbody/tr/td/span/text()') person = doc.xpath('//div[@class="table_container"]//tbody/tr/td[2]/text()') person = [x.replace(",", "") for x in person] death = doc.xpath('//div[@class="table_container"]//tbody/tr/td[3]/text()') death = [x.replace(",", "") for x in death] message = list(zip(area, person, death)) import csv with open("美国数据.csv", "w") as f: w = csv.writer(f) w.writerows(message[-58:]) import pandas as pd

df = pd.read_csv("C://Users/86184/Desktop/Computer Desk/期末用/美国数据.csv",names=['area','person','death'],engine= "python",encoding='gbk') import matplotlib.pyplot as plt plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['figure.figsize'] = (100,50) x = df["area"].values y = df["death"].values plt.bar(x, y) plt.xlabel("州",fontsize=14) plt.ylabel("死亡人数",fontsize=14) plt.show() #最后,我们利用爬虫完成了对于菜鸟网站上列表的教程信息获取。 import requests url = "https://www.runoob.com/python3/python3-list.html" headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" } response = requests.get(url, headers=headers) html = response.text from lxml import etree parse = etree.HTMLParser(encoding='utf-8') doc = etree.HTML(html) tx1=doc.xpath('/html/body/div[4]/div/div[2]/div/div[3]/div/h1/text()') tx2=doc.xpath('/html/body/div[4]/div/div[2]/div/div[3]/div/p/text()') tx3=[] for i in range(len(tx2)): if i!=len(tx2)-1: tx3.append(tx2[i]) tx3.append('\n')

Finally, we used the scraper to fetch the text of the Python list tutorial from the Runoob (菜鸟教程) site:

```python
import requests
from lxml import etree

url = "https://www.runoob.com/python3/python3-list.html"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
response = requests.get(url, headers=headers)
html = response.text

parse = etree.HTMLParser(encoding='utf-8')
doc = etree.HTML(html, parser=parse)
tx1 = doc.xpath('/html/body/div[4]/div/div[2]/div/div[3]/div/h1/text()')  # page title
tx2 = doc.xpath('/html/body/div[4]/div/div[2]/div/div[3]/div/p/text()')   # paragraphs

# Interleave the paragraphs with newlines, dropping the last one.
tx3 = []
for i in range(len(tx2)):
    if i != len(tx2) - 1:
        tx3.append(tx2[i])
        tx3.append('\n')

with open('菜鸟网页列表爬取.txt', 'w', encoding='utf-8') as f:
    f.writelines(tx3)
```

The data-visualization images are attached in the accompanying document.
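The absolute XPath `/html/body/div[4]/...` above breaks as soon as Runoob changes its page layout. Selecting the article container by class is usually more stable; a sketch, assuming the tutorial body carries a class such as `article-intro` (a hypothetical name to verify in the browser's developer tools):

```python
# "article-intro" is an assumed class name; inspect the page to confirm it.
paras = doc.xpath('//div[@class="article-intro"]//p/text()')
text = "\n".join(p.strip() for p in paras if p.strip())
with open('菜鸟网页列表爬取.txt', 'w', encoding='utf-8') as f:
    f.write(text)
```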