You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
China_top500_company/陈乐鑫--python中国500强公司信息爬取代码.py

96 lines
2.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
"""
Created on Tue May 31 13:44:13 2022
@author: 86136
"""
import requests
import re
# Request headers: identify as a desktop browser when talking to the site.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53'}
# Accumulates (company name, registered capital) tuples across all pages.
message = []
# Fetch 17 listing pages (the original author expected 510 companies in total).
for page in range(17):
    # Build the URL: the first page has no numeric suffix, later pages are
    # numbered starting from 2.
    if page == 0:
        url = "https://top.chinaz.com/gongsi/index_zhuce.html"
    else:
        url = "https://top.chinaz.com/gongsi/index_zhuce_{}.html".format(page + 1)
    # Issue the GET request. The timeout keeps a stalled connection from
    # hanging the whole script, and raise_for_status() stops an HTTP error
    # page from being parsed as if it were a valid listing.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    html = response.text
    # Extract the fields with regular expressions.
    # Company names come from the link text inside each <h3>.
    company = re.findall('<a.*?target="_blank">(.+?)</a></h3>', html)
    # Registered capital is the text that follows the "注册资本:" label.
    money = re.findall('注册资本:</span>(.*?)</p>', html)
    # NOTE(review): zip() stops at the shorter list, so a page where the two
    # patterns match a different number of times would pair values wrongly
    # without any error.
    pageOne = list(zip(company, money))
    # Merge this page's rows into the overall result.
    message.extend(pageOne)
# Keep only the first 500 companies.
message = message[0:500]
# Write the scraped rows to disk.
# Import Python's built-in csv module.
import csv

# newline="" is required by the csv module so that it controls line endings
# itself (otherwise Windows output gets an extra blank line after every row).
# The encoding is pinned to GBK because this script reads the file back with
# encoding='gbk'; relying on the platform default only works on GBK locales.
with open("content.csv", "w", newline="", encoding="gbk") as f:
    w = csv.writer(f)
    w.writerows(message)
# Data visualisation: load the saved rows and normalise the capital figures.
# "%matplotlib inline" is IPython-only syntax and a SyntaxError in a plain .py
# file, so the inline backend is requested only when running under IPython.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Plain Python interpreter: no IPython shell, nothing to configure.
    pass
import pandas as pd

# Read the data; the file has no header row, so column names are supplied.
df = pd.read_csv("content.csv", names=["company", 'money'], encoding='gbk')
# Only the first 20 rows are charted.
lmoney = list(df['money'])[:20]
lcompany = list(df['company'])[:20]


def _capital_in_yi(text):
    """Convert a registered-capital string to a whole number of 亿 (1e8).

    Decimal numbers in the text take precedence over plain integers, and when
    several numbers are present the last one is used. Amounts marked with 万
    (1e4) are scaled by 0.0001; amounts marked with 亿 are used directly.

    Returns 0 when the text has no number or no recognised unit, so every
    company still gets exactly one plottable value.
    """
    text = str(text)  # tolerate missing cells, which pandas loads as NaN
    numbers = re.findall(r'\d+\.\d+', text) or re.findall(r'\d+', text)
    if not numbers:
        return 0
    value = float(numbers[-1])
    # The original test was `'' in i`, which is always true and made the 亿
    # branch unreachable. The 0.0001 factor shows the intended unit was 万.
    if '万' in text:
        return round(value * 0.0001)
    if '亿' in text:
        return round(value)
    return 0


# Convert every amount to the same unit (亿).
tmoney = [_capital_in_yi(i) for i in lmoney]
# Draw the horizontal bar chart with matplotlib.
import matplotlib.pyplot as plt

# SimHei is selected so the Chinese labels can be rendered; unicode_minus is
# switched off alongside it, as is usual when using that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Both lists are reversed in place, so the first row of the data is drawn last.
lcompany.reverse()
tmoney.reverse()
names, amounts = lcompany, tmoney

# Four colour bands of five bars each.
colors = [shade for shade in ('green', 'cyan', 'blue', 'red') for _ in range(5)]

plt.barh(names, amounts, height=0.7, color=colors)
plt.yticks(names, names)

# Label every bar with its value, placed at the end of the bar.
for name, amount in zip(names, amounts):
    plt.text(amount, name, amount, fontsize=8)

plt.title('注册资金最多的公司 top20')
plt.ylabel('公司名字')
plt.xlabel('注册资本/亿元')
plt.show()