parent
d28a0103e2
commit
7463dc6adb
@ -0,0 +1 @@
|
|||||||
|
#### 代码窗口
#1.数据爬取
import requests
import re
import csv
# Listing-page URLs: the first page has no numeric suffix, every later page
# N (N >= 2) is served as index_zhuce_N.html.
FIRST_PAGE_URL = "https://top.chinaz.com/gongsi/index_zhuce.html"
PAGE_URL_TEMPLATE = "https://top.chinaz.com/gongsi/index_zhuce_{}.html"
# Total number of listing pages to crawl.
TOTAL_PAGES = 903
# Seconds to wait for a response; without a timeout a stalled connection
# would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 10

# Request headers: send a browser-like user agent with every request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.42"
}

# Compile the extraction patterns once instead of on every page.
# Company name: the link text that closes an <h3>, followed by CR LF.
company_pattern = re.compile(r'target="_blank">(.*?)</a></h3>\r\n')
# Registered capital: the <em> text that precedes the "注册资本" label.
money_pattern = re.compile('<div class="CoDate"><em>(.*?)</em>注册资本</div>')

# Accumulated (company, registered capital) pairs from all pages.
message = []
for page in range(TOTAL_PAGES):
    # Build the URL for this page (page index 0 is the un-suffixed first page).
    if page == 0:
        url = FIRST_PAGE_URL
    else:
        url = PAGE_URL_TEMPLATE.format(page + 1)
    # Fetch the page; a failed page is reported and skipped so that one
    # network error does not discard everything collected so far.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("skipping {}: {}".format(url, exc))
        continue
    html = response.text
    company = company_pattern.findall(html)
    money = money_pattern.findall(html)
    # Pair names with capitals positionally; zip stops at the shorter list.
    message.extend(zip(company, money))

# Save the results. newline="" is required by the csv module (otherwise blank
# rows appear on Windows); UTF-8 matches the encoding used when the file is
# read back for plotting.
with open("contents.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerows(message)
!cat contents.csv
#绘图
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
# Load the scraped rows; the CSV has no header row, so column names are
# supplied here.
df = pd.read_csv("contents.csv", names=["company", "money"], encoding='utf-8')
# Use the SimHei font so that Chinese text renders in the figure.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Take the first 20 rows of the file as the data to plot.
top_rows = df.values[0:20]
companys = [name for name, _ in top_rows]
# Drop the last character of each capital string before converting to float
# (presumably a unit suffix -- TODO confirm against the scraped data).
moneys = [float(capital[:-1]) for _, capital in top_rows]
# Integer bar positions shared by the bars, tick labels and value labels, so
# they stay aligned even if a company name appears more than once.
positions = range(len(companys))
# Figure size (width, height) in inches.
plt.figure(figsize=(20, 10))
# Draw the bar chart.
plt.bar(positions, moneys)
# Label each bar with its company name, rotated to avoid overlap.
plt.xticks(positions, companys, rotation=90)
# Annotate each bar with its value just above the bar top.
for x, y in zip(positions, moneys):
    plt.text(x, y, y, ha='center', va='bottom')
# Title and axis labels.
plt.title("注册资金最多的公司 top20")
plt.xlabel("公司")
plt.ylabel("注册资金(亿元)")
# Call show(); the bare attribute reference `plt.show` does nothing.
plt.show()
|
Loading…
Reference in new issue