# Crawler ("pachong"): scrape the chinaz.com "Top 500 companies" ranking pages,
# store the extracted fields in two CSV files, then load them with pandas and
# draw a pie chart of the 20 rows with the largest "money" value.
import csv
import re

import chardet
import matplotlib.pyplot as plt
import pandas as pd
import requests

# Request header information: present a browser-like user agent.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.69 Safari/537.36"
    )
}

# URL of the first ranking page, and the template for pages 2..N.
FIRST_PAGE_URL = "https://top.chinaz.com/gongsitop/index_500top.html"
PAGE_URL_TEMPLATE = "https://top.chinaz.com/gongsitop/index_500top_{}.html"
# The ranking is spread over 16 pages in total.
PAGE_COUNT = 16
# Seconds to wait for the server; without a timeout requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Patterns are compiled once instead of being rebuilt for every page.
# NOTE(review): these patterns are reproduced exactly as they appear in this
# copy of the script (line breaks inside the literals are written as \n).
# They look like they lost their surrounding HTML tag markup at some point:
#   * COMPANY_RE has nothing around its group, so it matches every single
#     character of the page;
#   * MONEY_RE has no capture group, so findall() returns the literal label
#     text rather than the registered-capital value.
# Restore the real tag context from the live page before trusting the output.
COMPANY_RE = re.compile(r'(.+?)')          # company name
PERSON_RE = re.compile(r'法定代表人:(.*?)\n\n')   # legal representative
SIGN_DATE_RE = re.compile(r'注册时间:(.*?)\n\n')  # registration date
CATEGORY_RE = re.compile(r'证券类别:(.*?)\n\n')   # security category
MONEY_RE = re.compile(r'注册资本:')          # registered capital

message = []   # rows of (company, person, signDate, category)
message2 = []  # rows of (company, money)

# The original script also fetched and parsed the first page once before this
# loop, but never used that result (the loop re-fetches page 0), so that
# redundant request has been dropped.
for page in range(PAGE_COUNT):
    # Build the URL: the first page has no numeric suffix, later pages use
    # the 1-based page number.
    if page == 0:
        url = FIRST_PAGE_URL
    else:
        url = PAGE_URL_TEMPLATE.format(page + 1)

    # Issue the GET request with the requests module and take the body text.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html = response.text

    # Extract each field with findall.
    company = COMPANY_RE.findall(html)
    person = PERSON_RE.findall(html)
    signDate = SIGN_DATE_RE.findall(html)
    category = CATEGORY_RE.findall(html)
    money = MONEY_RE.findall(html)

    # zip() stops at the shortest list, so rows are only aligned when every
    # field was found the same number of times on the page.
    pageOne = list(zip(company, person, signDate, category))
    pageTwo = list(zip(company, money))

    # Merge this page's rows into the overall lists.
    message.extend(pageOne)
    message2.extend(pageTwo)

# newline="" is what the csv module requires for files handed to csv.writer;
# without it, extra blank rows appear on platforms that translate "\n".
with open("content.csv", "w", encoding='utf-8', newline="") as f:
    w = csv.writer(f)
    w.writerows(message)

with open("content2.csv", "w", encoding='utf-8', newline="") as f:
    w = csv.writer(f)
    w.writerows(message2)

# Detect the encoding of the first CSV from its raw bytes.
with open('content.csv', 'rb') as f:
    rawdata = f.read()
result = chardet.detect(rawdata)
print(result['encoding'])
# chardet reports None when it cannot decide (e.g. empty input); fall back to
# the encoding the file was written with.
encoding = result['encoding'] or 'utf-8'

# Load dataset 1.
df = pd.read_csv("content.csv", names=["company", "person", "signDate", "category"], encoding=encoding)
df.head()  # preview; only displayed in an interactive session
df.info()

# Number of companies per security category.
df1 = df.groupby("category").count()["company"]

# Detect the encoding of the second CSV.
with open('content2.csv', 'rb') as f:
    rawdata2 = f.read()
# Fixed: the original passed `rawdata` (bytes of content.csv) here, so the
# encoding of content2.csv was guessed from the wrong file.
result2 = chardet.detect(rawdata2)
print(result2['encoding'])
encoding = result2['encoding'] or 'utf-8'

# Load dataset 2 (rebinds `df`, as the original script did).
df = pd.read_csv("content2.csv", names=["company", 'money'], encoding=encoding)
df.head()  # preview; only displayed in an interactive session
df.info()

# Keep the 20 rows with the largest "money" value and count companies per value.
sorted_data = df.sort_values(by='money', ascending=False)
top_20 = sorted_data.head(20)
df2 = top_20.groupby("money").count()["company"]

# Use the SimHei font so Chinese labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']

labels = df2.index
sizes = df2.values
fig1, ax2 = plt.subplots()
ax2.pie(sizes, labels=labels, autopct='%d%%', radius=2, textprops={'fontsize': 20},
        shadow=False, startangle=90)
# NOTE(review): called without arguments this only queries the axis limits;
# possibly 'equal' was intended - confirm.
ax2.axis()
plt.title("top20")
plt.show()