# -*- coding: utf-8 -*-
"""Scrape company names and registered capital from top.chinaz.com and chart them.

The script downloads the paginated company listing, stores up to 500
(company, registered-capital) pairs in ``content.csv``, then draws a
horizontal bar chart of the first 20 rows.

Provenance: created on Fri Jun 3 14:51:42 2022 by 86136; added as
``500强爬取.py`` in commit 8f635dc ("ADD file via upload", 4 Jun 2022).
"""

import csv
import re

# Third-party packages (requests, pandas, matplotlib) are imported inside the
# functions that use them so the pure parsing helpers below stay importable
# without a network stack or plotting backend.

FIRST_PAGE_URL = "https://top.chinaz.com/gongsi/index_zhuce.html"
# NOTE(review): the original called ``.format(page + 1)`` on a URL with no
# placeholder, so every iteration re-downloaded page 1. The ``_N`` suffix is
# the presumed pagination scheme of the site -- confirm against the live site.
PAGED_URL_TEMPLATE = "https://top.chinaz.com/gongsi/index_zhuce_{}.html"

HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

# NOTE(review): both patterns are reproduced exactly as they appear in the
# source, but they look as if HTML tag text was stripped from them (a bare
# ``(.+?)`` matches every single character). Restore the tag anchors from the
# real page markup before relying on the scraped data.
COMPANY_PATTERN = '(.+?)'
MONEY_PATTERN = '注册资本:(.*?)\n\n'

PAGE_COUNT = 16
MAX_RECORDS = 500
TOP_N = 20
CSV_PATH = "content.csv"
# The CSV was always read back as GBK; writing with the same explicit encoding
# keeps the round trip consistent on machines whose default is not GBK.
CSV_ENCODING = "gbk"
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without one


def build_page_url(page):
    """Return the listing URL for the zero-based index *page*."""
    if page == 0:
        return FIRST_PAGE_URL
    return PAGED_URL_TEMPLATE.format(page + 1)


def fetch_page(url):
    """Download *url* with the browser-like headers and return the body text."""
    import requests

    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    return response.text


def extract_records(html, company_pattern=COMPANY_PATTERN,
                    money_pattern=MONEY_PATTERN):
    """Return ``(company, capital_text)`` pairs found in *html*.

    The two patterns are matched independently and paired positionally, so
    the result is truncated to the shorter of the two match lists.
    """
    companies = re.findall(company_pattern, html)
    capitals = re.findall(money_pattern, html)
    return list(zip(companies, capitals))


def parse_capital(text):
    """Convert a registered-capital string to a whole number of 亿 (1e8) units.

    The last decimal number in *text* is used, falling back to the last
    integer when there is no decimal. A value tagged with 万 (1e4) is scaled
    by 0.0001; a value tagged with 亿 is used as is. Both are rounded to the
    nearest integer, as the original script did.

    Returns 0 when *text* has no number or no recognised unit. The original
    produced an empty string there, which cannot be plotted next to numbers.
    """
    text = str(text)  # pandas may hand back a non-str cell
    numbers = re.findall(r"\d+\.\d+", text) or re.findall(r"\d+", text)
    if not numbers:
        return 0
    value = float(numbers[-1])
    if "万" in text:
        return round(value * 0.0001)
    if "亿" in text:
        return round(value)
    return 0


def scrape(page_count=PAGE_COUNT, limit=MAX_RECORDS):
    """Fetch up to *page_count* listing pages and return at most *limit* records."""
    records = []
    for page in range(page_count):
        records.extend(extract_records(fetch_page(build_page_url(page))))
        if len(records) >= limit:
            break  # enough rows collected; skip the remaining requests
    return records[:limit]


def save_records(records, path=CSV_PATH):
    """Write *records* to *path* as CSV rows."""
    # newline="" is what the csv module requires to avoid blank rows on Windows.
    with open(path, "w", newline="", encoding=CSV_ENCODING) as handle:
        csv.writer(handle).writerows(records)


def load_top(path=CSV_PATH, count=TOP_N):
    """Read the first *count* rows of *path*.

    Returns ``(companies, capitals)`` where *capitals* holds the parsed
    values from :func:`parse_capital`. Missing cells are treated as ``'0'``.
    """
    import pandas as pd

    frame = pd.read_csv(path, names=["company", "money"], encoding=CSV_ENCODING)
    top = frame.fillna("0").head(count)
    companies = list(top["company"])
    capitals = [parse_capital(cell) for cell in top["money"]]
    return companies, capitals


def plot_top(companies, capitals):
    """Draw a horizontal bar chart of *capitals* labelled by *companies*."""
    import matplotlib.pyplot as plt

    plt.rcParams["font.sans-serif"] = ["SimHei"]  # font able to render CJK labels
    plt.rcParams["axes.unicode_minus"] = False

    # barh draws from the bottom up, so reverse to put the first row on top.
    names = companies[::-1]
    values = capitals[::-1]
    palette = ["red"] * 5 + ["blue"] * 5 + ["green"] * 5 + ["yellow"] * 5

    plt.barh(names, values, height=0.7, color=palette[:len(names)])
    plt.yticks(names, names)
    for name, value in zip(names, values):
        plt.text(value, name, value, fontsize=8)
    plt.title("注册资金最多的公司top20")
    # In a horizontal bar chart the values run along x and the categories
    # along y; the original had these two labels swapped.
    plt.xlabel("注册资金/亿元")
    plt.ylabel("公司")
    plt.show()


def main():
    """Scrape, persist to CSV, then chart the first rows."""
    records = scrape()
    if not records:
        raise SystemExit("No records were scraped; check the URL and patterns.")
    save_records(records)
    companies, capitals = load_top()
    plot_top(companies, capitals)


if __name__ == "__main__":
    main()