You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
China_top500_company/陈乐鑫--python中国500强公司信息爬取代码.py

96 lines
2.3 KiB

# -*- coding: utf-8 -*-
"""
Created on Tue May 31 13:44:13 2022
@author: 86136
"""
import requests
import re
# Request headers: send a desktop-browser user agent with every request.
headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53'}
# Accumulates (company name, registered capital) tuples from every page.
message = []
# Fetch 17 listing pages (510 companies in total, per the original author's note).
for page in range(17):
    # Build the URL: the first page has no numeric suffix, later pages are
    # numbered starting from 2.
    if page == 0:
        url = "https://top.chinaz.com/gongsi/index_zhuce.html"
    else:
        url = "https://top.chinaz.com/gongsi/index_zhuce_{}.html".format(page + 1)
    # Issue the GET request with the requests module. The timeout keeps a
    # single stalled connection from hanging the whole run.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page, which
    # would silently yield no rows and shift the top-500 cut-off below.
    response.raise_for_status()
    html = response.text
    # Extract the data with re.findall.
    # Company names
    company = re.findall('<a.*?target="_blank">(.+?)</a></h3>', html)
    # Registered-capital strings
    money = re.findall('注册资本:</span>(.*?)</p>', html)
    # Pair each name with its capital; zip stops at the shorter list.
    pageOne = list(zip(company,money))
    # Merge this page's rows into the overall list.
    message.extend(pageOne)
# Keep only the first 500 companies.
message=message[0:500]
# Write the collected rows to disk.
# csv is a module from the Python standard library.
import csv
# newline="" lets the csv writer control line endings itself (without it,
# rows are written with doubled line endings on Windows). encoding="gbk" is
# given explicitly because the file is read back with encoding='gbk' further
# down; relying on the platform default only works where that default is GBK.
with open("content.csv", "w", newline="", encoding="gbk") as f:
    w = csv.writer(f)
    w.writerows(message)
# Data visualisation
# Turn on inline plotting when running under IPython/Jupyter. The bare
# "%matplotlib inline" magic is not valid Python syntax, so it is issued
# through get_ipython(), which only exists inside an IPython session.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass
import pandas as pd
# Load the data; the file has no header row, so column names are supplied here.
df = pd.read_csv("content.csv", names=["company",'money'],encoding='gbk')
# Only the first 20 rows are charted.
lmoney=list(df['money'])[:20]
lcompany=list(df['company'])[:20]
# Convert every capital string to a whole number of 亿 (1亿 = 10,000万).
tmoney=[]
for capital in lmoney:
    # Prefer a decimal number; fall back to plain integers when none is found.
    numbers = re.findall(r'\d+\.\d+', capital) or re.findall(r'\d+', capital)
    # Stays '' when no number or no recognised unit is present in the string.
    amount = ''
    for number in numbers:
        if '万' in capital:
            # Amount is expressed in 万: scale down to 亿.
            amount = round(float(number) * 0.0001)
        elif '亿' in capital:
            amount = round(float(number))
    tmoney.append(amount)
# Draw the chart with matplotlib.
import matplotlib.pyplot as plt
# Use the SimHei font for sans-serif text (the labels below are Chinese) and
# switch off the Unicode minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Reverse both lists in place so the first CSV row is plotted last.
for series in (tmoney, lcompany):
    series.reverse()
x, y = lcompany, tmoney
# Four colour bands of five bars each.
colors = [shade for shade in ('green', 'cyan', 'blue', 'red') for _ in range(5)]
plt.barh(x, y, height=0.7, color=colors)
plt.yticks(x, x)
# Print each value next to the end of its bar.
for name, value in zip(x, y):
    plt.text(value, name, value, fontsize=8)
plt.title('注册资金最多的公司 top20')
plt.ylabel('公司名字')
plt.xlabel('注册资本/亿元')
plt.show()