# China_top500_company/陈乐鑫--python中国500强公司信息爬取代码.py

# -*- coding: utf-8 -*-
"""
Created on Tue May 31 13:44:13 2022

@author: 86136
"""
import requests
import re
# Request headers: send a desktop-browser user-agent with every request.
headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53'}
# Accumulates (company name, registered capital) tuples from every page.
message = []
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10
# Compile the extraction patterns once instead of once per page.
_company_pattern = re.compile(r'<a.*?target="_blank">(.+?)</a></h3>')
_money_pattern = re.compile(r'注册资本：</span>(.*?)</p>')

# Fetch 17 listing pages (510 companies in total).
for page in range(17):
    # Build the URL: the first page has no numeric suffix, later pages are
    # numbered starting from 2.
    if page == 0:
        url = "https://top.chinaz.com/gongsi/index_zhuce.html"
    else:
        url = "https://top.chinaz.com/gongsi/index_zhuce_{}.html".format(page + 1)
    # Issue the GET request with the requests module.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html = response.text
    # Extract the company names and the registered-capital strings.
    company = _company_pattern.findall(html)
    money = _money_pattern.findall(html)
    # zip pairs the two lists by position and stops at the shorter one.
    pageOne = list(zip(company, money))
    # Append this page's rows to the overall result.
    message.extend(pageOne)
# Keep only the first 500 companies.
message = message[:500]


# Write the scraped rows to disk.
# csv is a module from the Python standard library.
import csv
# newline="" lets the csv module control line endings itself (otherwise
# Windows output contains an extra blank line after every row), and the
# explicit gbk encoding matches the encoding used when content.csv is read
# back, instead of depending on the platform's default encoding.
with open("content.csv", "w", newline="", encoding="gbk") as f:
    w = csv.writer(f)
    w.writerows(message)
# Data visualisation
# "%matplotlib inline" is an IPython magic and a SyntaxError in plain Python.
# Call the equivalent API when an IPython shell is present and do nothing
# otherwise, so the script runs in both environments.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Not running under IPython: there is no inline backend to enable.
    pass
import pandas as pd
# Load the scraped data; the file has no header row, so name the columns here.
df = pd.read_csv("content.csv", names=["company",'money'],encoding='gbk')
# Only the first 20 rows are plotted.
lmoney=list(df['money'])[:20]
lcompany=list(df['company'])[:20]
# Convert every registered-capital string to a common unit (亿, i.e. 1e8).
def _capital_in_yi(text):
    """Return the amount in *text* expressed in whole units of 亿.

    The last decimal number in *text* is used; if there is no decimal
    number, the last integer is used instead. The unit is taken from the
    characters present in *text*: 万亿 (1e12), 万 (1e4) or 亿 (1e8). The
    result is rounded with the built-in round().

    Returns 0 when *text* is not a string (e.g. a NaN from an empty CSV
    cell), contains no number, or names none of the known units, so the
    resulting list is always numeric and safe to plot.
    """
    if not isinstance(text, str):
        return 0
    # Prefer decimal numbers; fall back to plain integers.
    numbers = re.findall(r'\d+\.\d+', text) or re.findall(r'\d+', text)
    if not numbers:
        return 0
    value = float(numbers[-1])
    # 万亿 must be tested first: it contains both 万 and 亿.
    if '万亿' in text:
        return round(value * 10000)
    if '万' in text:
        return round(value * 0.0001)
    if '亿' in text:
        return round(value)
    return 0


tmoney = [_capital_in_yi(i) for i in lmoney]
# Draw the horizontal bar chart with matplotlib.
import matplotlib.pyplot as plt
# Select the SimHei font for the Chinese labels and keep the minus sign as a
# plain ASCII hyphen.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Reverse both lists in place so the first row ends up at the top of the
# chart (barh draws the first entry at the bottom).
tmoney[:] = tmoney[::-1]
lcompany[:] = lcompany[::-1]
y = tmoney
x = lcompany
# Four colour bands of five bars each.
colors = [shade for shade in ('green', 'cyan', 'blue', 'red') for _ in range(5)]
plt.barh(x, y, height=0.7, color=colors)
plt.yticks(x, x)
# Label each bar with its value at the bar's end.
for name, amount in zip(x, y):
    plt.text(amount, name, amount, fontsize=8)
plt.title('注册资金最多的公司 top20')
plt.ylabel('公司名字')
plt.xlabel('注册资本/亿元')
plt.show()