You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

81 lines
2.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# 1
import re
import requests
# Scrape the chinaz "registered capital" company ranking into `message`,
# a list of (company name, registered capital text) tuples.

# First page of the ranking; later pages are derived from it below.
url = "https://top.chinaz.com/gongsi/index_zhuce.html"
# Request headers: send a browser-like User-Agent with every request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
# Compiled once because the same two patterns are applied to every page.
# Company name: the link text that closes an <h3> heading.
company_pattern = re.compile(r'<a.*?target="_blank">(.+?)</a></h3>')
# Registered capital: the text that follows the "注册资本:" label.
capital_pattern = re.compile(r'注册资本:</span>(.*?)</p>')

# Collected rows, one (company, capital) tuple per ranking entry.
message = []
# The ranking is spread over 17 pages in total.
for page in range(17):
    # Build the URL of the current page.
    if page == 0:
        page_url = url
    else:
        # The original format string had no placeholder, so every iteration
        # re-fetched page 1.
        # NOTE(review): assumes pages 2..17 live at index_zhuce_<n>.html --
        # confirm against the site's pagination links.
        page_url = "https://top.chinaz.com/gongsi/index_zhuce_{}.html".format(page + 1)
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(page_url, headers=headers, timeout=10)
    html = response.text
    company = company_pattern.findall(html)
    capital = capital_pattern.findall(html)
    # Pair each company with its capital (the original zipped an undefined
    # name `debt`, which raised NameError) and append the pairs.
    message.extend(zip(company, capital))
import csv

# Persist the scraped rows to content.csv, one (company, capital) pair per line.
# newline="" lets the csv module control line endings (otherwise Windows gets
# an extra blank line after every row); UTF-8 is set explicitly so the Chinese
# text does not depend on the platform's locale encoding.
with open("content.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    # `message` is a list of row tuples; each one becomes one CSV line.
    w.writerows(message)
import pandas as pd

# The CSV has no header row, so the column labels are supplied here.
column_names = ["company", "capital"]
df = pd.read_csv("content.csv", names=column_names)

# First 500 rows: names and registered capital of the top-500 companies.
df4 = df.head(500)
# First 20 rows: the top-20 companies used by the later steps.
df2 = df.head(20)

print(df4)
print(df2)
# Currency wording that may follow the 万/亿 unit in a capital string.
_CURRENCY_SUFFIXES = ("", "元", "人民币", "元人民币")


def _capital_to_wan(text):
    """Return the capital string *text* as a numeric string in units of 万.

    Values given in 万 keep their number unchanged; values given in 亿 are
    multiplied by 10000. Raises ValueError when no 万/亿 unit is present or
    when the text after the unit is not one of the known RMB suffixes, rather
    than silently reusing the previous row's value.
    """
    text = str(text).strip()
    # 亿 is tried first: in the original code the 亿 branch also overrode 万.
    for unit in ("亿", "万"):
        number, found, currency = text.partition(unit)
        if not found:
            continue
        if currency.strip() not in _CURRENCY_SUFFIXES:
            raise ValueError(
                "unsupported currency in registered capital: {!r}".format(text)
            )
        number = number.strip()
        if unit == "亿":
            # 1 亿 = 10000 万
            number = str(float(number) * 10000)
        return number
    raise ValueError(
        "no recognised unit (万/亿) in registered capital: {!r}".format(text)
    )


# Normalise every top-20 capital value to the same unit (万), keeping row order.
df3 = [_capital_to_wan(value) for value in df2['capital']]
# Draw a bar chart (pie-chart labels would overlap, so a pie chart is not used).
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese text with SimHei
plt.rcParams['axes.unicode_minus'] = False  # show minus signs correctly on the axes
# Create the figure BEFORE drawing: calling plt.figure() after the plot calls
# opens a second, empty figure and leaves the chart at the default size.
plt.figure(figsize=(30, 20))  # figure width and height
plt.grid()  # grid lines
b = df2['company']  # company names used as x tick labels
# Capital values (already normalised to 万) converted from str to float.
sizes = [float(i) for i in df3]
# Bar positions follow the actual row count instead of a hard-coded 20, so a
# shorter data set does not cause a length mismatch.
positions = range(len(sizes))
for i, j in zip(positions, sizes):
    # Annotate each bar with its exact capital value.
    plt.text(i, j + 0.5, j, ha='center', va='bottom', rotation=90)
plt.bar(positions, sizes, width=0.8)
plt.xticks(positions, b, rotation=90)  # rotation=90 makes the x labels vertical
plt.xlabel('公司名称')  # x-axis label
plt.ylabel('单位:万元')  # y-axis label
plt.title('注册资金最多的公司 top20 ')  # chart title
plt.show()