You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Crawler/中国500强公司信息爬取.py

93 lines
3.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re

import requests

# Request headers: send a desktop-browser user agent with every request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"
}

# Patterns are compiled once, outside the page loop.
# Company name, e.g.
# <a href="/company/ZhongGuoShiYouHuaGongGuFen.html" target="_blank">中国石油化工股份有限公司</a></h3>
_COMPANY_RE = re.compile('<a.*?target="_blank">(.+?)</a></h3>')
# Registered capital, e.g.  注册资本:</span>1210.71亿元</p>
# NOTE(review): confirm the colon in this pattern matches the page markup
# (ASCII vs. full-width) -- it is kept exactly as found.
_MONEY_RE = re.compile('注册资本:</span>(.*?)</p>')

# Seconds to wait on the server before giving up on a page.
_REQUEST_TIMEOUT = 10

# Accumulated (company name, registered capital text) pairs from all pages.
message = []

# The ranking is spread over 16 pages.
for page in range(16):
    # Build the page URL: the first page has no numeric suffix, later pages
    # are numbered from 2, e.g.
    # https://top.chinaz.com/gongsitop/index_500top_2.html
    if page == 0:
        url = "https://top.chinaz.com/gongsitop/index_500top.html"
    else:
        url = "https://top.chinaz.com/gongsitop/index_500top_{}.html".format(page + 1)

    # Issue the GET request. Without a timeout a stalled connection would
    # block the script forever; with one, requests raises instead.
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    html = response.text

    # Extract both fields from the page body.
    company = _COMPANY_RE.findall(html)
    money = _MONEY_RE.findall(html)

    # Pair each company with its capital. NOTE(review): zip() stops at the
    # shorter list, so a page where the two patterns match a different number
    # of times yields truncated (and possibly shifted) pairs without warning.
    pageOne = list(zip(company, money))
    message.extend(pageOne)
import csv

# Persist the scraped (company, registered capital) rows as CSV.
# - newline="" is what the csv module asks of the file object; without it,
#   extra blank lines appear between rows on platforms that translate "\n".
# - encoding="gbk" is stated explicitly because this script reads the file
#   back with encoding='gbk'; relying on the platform default only works
#   where that default happens to be GBK.
with open("content01.csv", "w", newline="", encoding="gbk") as f:
    w = csv.writer(f)
    w.writerows(message)
import pandas as pd

# Load the scraped rows. dtype=str keeps both columns as text so the
# per-character scan below always receives a string, even if every capital
# value happened to look numeric.
df = pd.read_csv("content01.csv", names=["company", "money"], encoding='gbk', dtype=str)
# Replace missing cells with the text '0'.
df = df.fillna('0')

company_all = list(df['company'])
money_all = list(df['money'])


def _capital_to_yuan(text):
    """Convert a registered-capital string such as '1210.71亿元' to a number.

    The digits and dots in *text* are concatenated and parsed as a float,
    then scaled by the unit and currency markers found in *text*. Each
    scaling step rounds to the nearest integer (``int(x + 0.5)``). Text with
    no parseable number yields 0.0. When no marker matches, the parsed float
    is returned unscaled.
    """
    digits = ''.join(ch for ch in text if ch in '0123456789.')
    try:
        value = float(digits)
    except ValueError:
        # No digits at all (or only stray dots): treat as zero, in line with
        # the '0' used above for missing cells, instead of crashing.
        value = 0.0

    # Unit multipliers.
    # NOTE(review): the original condition here tested for an empty string,
    # which is true for every value; '万' (ten thousand) is reconstructed
    # from the 10000 multiplier -- confirm.
    if '万' in text:
        value = int(value * 10000 + 0.5)
    if '亿' in text:
        value = int(value * 100000000 + 0.5)

    # Currency conversion; exchange rates taken on 2022-05-26.
    if '美元' in text:
        value = int(value * 6.7388 + 0.5)
    # NOTE(review): the original marker for this rate was also an empty
    # string; 0.8585 is presumably the Hong Kong dollar rate, so '港'
    # (as in 港元 / 港币) is assumed -- confirm.
    if '港' in text:
        value = int(value * 0.8585 + 0.5)
    return value


# Registered capital converted to a plain number per company.
money_all_number = [_capital_to_yuan(i) for i in money_all]

# Rebuild the frame with numeric capital and sort it, largest first.
data = {'company': company_all, 'money': money_all_number}
df = pd.DataFrame(data)
df = df.sort_values(by=['money'], ascending=False)
# Take the company names and registered capital of the first twenty rows
# (the frame is expected to be sorted by capital, largest first).
company_top20 = df.iloc[0:20, 0]
money_top20 = df.iloc[0:20, 1]

# Render each company name vertically: one character per line, every
# character followed by a newline.
# NOTE(review): the original compared each character against an empty string
# and substituted an empty string, which can never match a single character.
# Presumably parentheses were being swapped for their vertical presentation
# forms so they read correctly in a top-to-bottom label -- confirm the exact
# characters intended.
_VERTICAL_PARENS = str.maketrans({
    '(': '︵',
    ')': '︶',
    '(': '︵',
    ')': '︶',
})
company_top20_y = [
    ''.join(ch + '\n' for ch in name.translate(_VERTICAL_PARENS))
    for name in company_top20
]
# Draw the bar chart (the original note says it is shown directly in Jupyter).
import matplotlib.pyplot as plt

# Step 1: switch the sans-serif font to SimHei so Chinese text renders.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Step 2: fix the display of the minus sign on the axes.
plt.rcParams['axes.unicode_minus'] = False

# One bar per selected company. The positions follow the actual number of
# rows, so the plot still works when fewer than twenty companies are present
# (a fixed range(20) would raise a shape mismatch in that case).
positions = range(len(money_top20))
plt.bar(positions, money_top20)
plt.xticks(positions, company_top20_y)
# Y axis labelled in units of 亿 (1e8): 5e10 -> '500亿', and so on.
plt.yticks([5e10,10e10,15e10,20e10,25e10,30e10,35e10],['500亿','1000亿','1500亿','2000亿','2500亿','3000亿','3500亿'])
plt.ylabel('注册资金')
plt.title('中国500强公司注册资金前二十的公司及注册资金示意图')
plt.show()