mytop20of500NB/500强爬取.py

# -*- coding: utf-8 -*-
"""
Created on Fri Jun  3 14:51:42 2022

@author: 86136
"""

#### Code window
# Step 1: scrape (company name, registered capital) pairs from the
# chinaz.com company listing pages (index_zhuce*.html).
import re

import requests

headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}

# The patterns do not change between pages, so compile them once.
company_pattern = re.compile('<a.*?target="_blank">(.+?)</a></h3>')
money_pattern = re.compile('注册资本：</span>(.*?)</p>')

message = []
for page in range(16):
    if page == 0:
        # The first page has no numeric suffix.
        url = "https://top.chinaz.com/gongsi/index_zhuce.html"
    else:
        # The previous URL had no "{}" placeholder, so .format() was a no-op
        # and every iteration re-downloaded page 1.
        # NOTE(review): assumes later pages are named index_zhuce_<n>.html
        # (n starting at 2) -- confirm against the live site.
        url = "https://top.chinaz.com/gongsi/index_zhuce_{}.html".format(page + 1)
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text
    company = company_pattern.findall(html)
    money = money_pattern.findall(html)
    # zip() pairs names with capitals positionally and stops at the shorter list.
    message.extend(zip(company, money))
# Keep at most the first 500 scraped rows.
message = message[0:500]
# Step 2: save the scraped rows to content.csv, load them back with pandas and
# turn the first 20 registered-capital strings into numbers (unit: 亿).
import csv

import pandas as pd

# Use one explicit encoding for both writing and reading. Previously the file
# was written with the platform default but read back as GBK, which only
# matched on GBK-locale systems. gb18030 is a superset of GBK that can encode
# any character, so GBK-only rows still produce the same bytes.
CSV_ENCODING = "gb18030"


def _capital_in_yi(text):
    """Convert a registered-capital string to a whole number of 亿.

    Decimal numbers are preferred; plain integers are used only when the
    string contains no decimal number. When several numbers are present the
    last one wins. A value marked with 万 is scaled by 0.0001, a value marked
    with 亿 is taken as-is, and both are rounded to an integer.

    Returns 0 when the string has no number or no recognised unit, so the
    result is always numeric and safe to plot.
    """
    numbers = re.findall(r'\d+\.\d+', text) or re.findall(r'\d+', text)
    if not numbers:
        return 0
    value = float(numbers[-1])
    if '万' in text:
        return round(value * 0.0001)
    if '亿' in text:
        return round(value)
    return 0


# newline="" is required by the csv module; without it Windows output gets an
# extra blank line after every row.
with open("content.csv", "w", newline="", encoding=CSV_ENCODING) as f:
    w = csv.writer(f)
    w.writerows(message)

# dtype=str keeps the capital column textual so the regex parsing below always
# receives strings.
df = pd.read_csv("content.csv", names=["company", "money"], encoding=CSV_ENCODING, dtype=str)
# Missing cells become the string '0', which parses to 0 below.
df = df.fillna('0')
money1 = list(df['money'])[:20]
company1 = list(df['company'])[:20]
money2 = [_capital_in_yi(raw) for raw in money1]
# Step 3: draw a horizontal bar chart of the 20 companies and their
# registered capital.
import matplotlib.pyplot as plt

# SimHei provides the CJK glyphs needed for the Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Use the ASCII hyphen for minus signs so they render with that font.
plt.rcParams['axes.unicode_minus'] = False

# barh() places the first entry at the bottom of the chart; reversing both
# lists in place puts the first data row at the top instead.
money2.reverse()
company1.reverse()
x = company1
y = money2

# Four colour bands of five bars each.
colors = ['red'] * 5 + ['blue'] * 5 + ['green'] * 5 + ['yellow'] * 5
plt.barh(x, y, height=0.7, color=colors)
plt.yticks(x, x)

# Print each value at the end of its bar.
for name, value in zip(x, y):
    plt.text(value, name, value, fontsize=8)

plt.title('注册资金最多的公司top20')
# In a horizontal bar chart the x-axis carries the values and the y-axis the
# categories; the two labels were previously swapped.
plt.xlabel('注册资金/亿元')
plt.ylabel('公司')
plt.show()
ADD file via upload 3 years ago			`# -*- coding: utf-8 -*-`
			`"""`
			`Created on Fri Jun 3 14:51:42 2022`

			`@author: 86136`
			`"""`

			`#### 代码窗口`
			`import requests`
			`url = "https://top.chinaz.com/gongsi/index_zhuce.html"`
			`headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}`
			`response = requests.get(url, headers=headers)`
			`html = response.text`
			`import re`
			`message=[]`
			`for page in range(16):`
			`if page == 0:`
			`url = "https://top.chinaz.com/gongsi/index_zhuce.html"`
			`else:`
			`url = "https://top.chinaz.com/gongsi/index_zhuce.html".format(page + 1)`
			`response = requests.get(url, headers=headers)`
			`html = response.text`
			`company = re.findall('<a.*?target="_blank">(.+?)</a></h3>', html)`
			`money = re.findall('注册资本：</span>(.*?)</p>', html)`
			`pageOne = list(zip(company, money))`
			`message.extend(pageOne)`
			`message=message[0:500]`
			`import csv`
			`with open("content.csv", "w") as f:`
			`w = csv.writer(f)`
			`w.writerows(message)`
			`import pandas as pd`
			`df = pd.read_csv("content.csv", names=["company", "money"],encoding='gbk')`
			`df=df.fillna('0')`
			`money1=list(df['money'])[:20]`
			`company1=list(df['company'])[:20]`
			`money2=[]`
			`for i in money1:`
			`p=''`
			`j=re.findall(r'\d+\.\d+',i)`
			`if j==[]:`
			`j=re.findall(r'\d+',i)`
			`for q in j:`
			`if '万' in i:`
			`p=round(float(q)*0.0001)`
			`elif '亿' in i:`
			`p=round(float(q))`
			`money2.append(p)`
			`import matplotlib.pyplot as plt`
			`plt.rcParams['font.sans-serif'] = ['SimHei']`
			`plt.rcParams['axes.unicode_minus']=False`
			`money2.reverse()`
			`company1.reverse()`
			`x=company1`
			`y=money2`
			`colors=['red']*5+['blue']*5+['green']*5+['yellow']*5`
			`plt.barh(x,y,height=0.7,color=colors)`
			`plt.yticks(x,x)`
			`for i,j in zip(x,y):`
			`plt.text(j,i,j,fontsize=8)`
			`plt.title('注册资金最多的公司top20')`
			`plt.xlabel('公司')`
			`plt.ylabel('注册资金/亿元')`
			`plt.show()`