"""Scrape the chinaz.com Hunan company ranking and chart it by category.

The script downloads every ranking page, extracts one
``(company, person, signDate, category)`` row per match, stores the rows in
``content.csv`` and finally draws a pie chart of the number of companies per
security category.
"""
import csv
import re

# Third-party packages (requests, pandas, matplotlib) are imported inside the
# functions that use them, so the pure helpers below stay importable on a
# machine that lacks the scraping or plotting stack.

FIRST_PAGE_URL = "https://top.chinaz.com/gongsi/index_HuNan_zhuce.html"
PAGED_URL_TEMPLATE = "https://top.chinaz.com/gongsi/index_HuNan_zhuce_{}.html"
PAGE_COUNT = 16
REQUEST_TIMEOUT = 10  # seconds; requests waits forever when no timeout is set

HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

CSV_PATH = "content.csv"
# The file is read back as cp936, so it has to be written as cp936 too
# instead of relying on the platform default encoding.
CSV_ENCODING = "cp936"
COLUMNS = ["company", "person", "signDate", "category"]

# NOTE(review): these patterns look like they lost their HTML tags somewhere
# along the way (the company pattern is a bare group that matches any single
# character, and the others end in two line breaks where a closing tag was
# probably expected). Confirm them against the live page markup.
COMPANY_RE = re.compile('(.+?)')
PERSON_RE = re.compile('法定代表人:(.+?)\n\n')
SIGN_DATE_RE = re.compile('注册时间:(.+?)\n\n')
CATEGORY_RE = re.compile('证券类别:(.+?)\n\n')

# Pie-slice offsets; repeated cyclically when there are more than two slices.
EXPLODE_PATTERN = (0.11, 0.2)


def page_url(page):
    """Return the URL of the zero-based ranking page *page*.

    The first page has no numeric suffix; every later page uses the
    one-based page number as suffix (page index 1 -> ``..._2.html``).
    """
    if page == 0:
        return FIRST_PAGE_URL
    return PAGED_URL_TEMPLATE.format(page + 1)


def fetch_html(url, timeout=REQUEST_TIMEOUT):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an
            HTTP error status (so an error page is never parsed as data).
    """
    import requests

    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_companies(html):
    """Extract ``(company, person, signDate, category)`` tuples from *html*.

    The four fields are matched independently and paired up by position;
    ``zip`` stops at the shortest list, so a field that is missing for some
    company silently shortens (and may misalign) the result.
    """
    company = COMPANY_RE.findall(html)
    person = PERSON_RE.findall(html)
    sign_date = SIGN_DATE_RE.findall(html)
    category = CATEGORY_RE.findall(html)
    return list(zip(company, person, sign_date, category))


def scrape_companies(fetch=fetch_html, page_count=PAGE_COUNT):
    """Collect the rows of the first *page_count* ranking pages.

    Each page is downloaded with *fetch* (a callable mapping a URL to its
    HTML text) and parsed on its own, so every page contributes its own rows.
    """
    rows = []
    for page in range(page_count):
        rows.extend(parse_companies(fetch(page_url(page))))
    return rows


def save_rows(rows, path=CSV_PATH):
    """Write *rows* to the CSV file *path* (no header line)."""
    # newline="" is what the csv module requires; without it Windows gets an
    # empty line after every record.
    with open(path, "w", newline="", encoding=CSV_ENCODING) as f:
        csv.writer(f).writerows(rows)


def explode_offsets(count, pattern=EXPLODE_PATTERN):
    """Return *count* pie-slice offsets by cycling through *pattern*.

    ``Axes.pie`` needs exactly one offset per slice; a fixed two-element
    tuple only works when the data happens to contain two categories.
    """
    return tuple(pattern[i % len(pattern)] for i in range(count))


def plot_category_share(path=CSV_PATH):
    """Read the CSV at *path* and show a pie chart of companies per category."""
    import matplotlib.pyplot as plt
    import pandas as pd

    df = pd.read_csv(path, names=COLUMNS, encoding=CSV_ENCODING)
    # A bare ``df.head()`` shows nothing when run as a script; print it.
    print(df.head())
    per_category = df.groupby("category").count()["company"]

    # SimHei is selected so the Chinese labels and title can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    fig, ax = plt.subplots()
    ax.pie(
        per_category.values,
        labels=per_category.index,
        autopct='%3.1f%%',
        explode=explode_offsets(len(per_category)),
        textprops={'fontsize': 18, 'color': 'k'},
        radius=2,
        shadow=True,
        startangle=180,
        colors=('c', 'y'),
    )
    ax.set_title('湖南企业五百强')
    ax.axis('equal')
    plt.show()


def main():
    """Scrape all pages, store them as CSV and plot the category shares."""
    rows = scrape_companies()
    save_rows(rows)
    if not rows:
        # An empty CSV would only fail later inside pandas with a less
        # helpful error; stop here with a clear message instead.
        raise SystemExit(
            "no rows were extracted - check the regex patterns against the page markup"
        )
    plot_category_share()


if __name__ == "__main__":
    main()