# -*- coding: utf-8 -*-
"""
Created on Sun May 26 19:24:00 2024

@author: Panda

Scrape the company listing pages under ``url_head``, save the parsed rows to
``result_2.csv`` and draw a bar chart of the registered capital of the first
20 rows.
"""

import requests
import pandas as pd
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# Common URL prefix of every listing page; crawl() appends a page suffix
# and ".html".
url_head = "https://top.chinaz.com/gongsi/index_zhuce"

# HTTP headers sent with every request.
Headers = {
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/104.0.5112.102"
}

# Module-level list of rows (a list of lists). crawl() appends to it and
# main() turns it into a DataFrame.
tl = []


def _after_colon(field):
    """Return the text after the first ':' in *field*.

    When *field* contains no ':', ``find`` returns -1 and the whole string
    is returned unchanged.
    """
    return field[field.find(':') + 1:]


def _parse_company_text(text):
    """Parse the text of one "CoListTxt" element into a data row.

    The text is split on single spaces (newlines are treated as spaces) and
    empty tokens are dropped. The tokens are then read positionally.

    Returns:
        ``[company, person, capnum, date, stock_type, stock_code]`` where
        ``capnum`` is a float in units of 10,000 yuan, or ``None`` when the
        text has too few fields or the capital field contains no number.
    """
    fields = [tok for tok in text.strip('\n').replace('\n', ' ').split(' ') if tok]
    if len(fields) < 4:
        # Not enough fields to read company/person/capital/date; skip the
        # entry instead of aborting the whole crawl with an IndexError.
        return None

    company = fields[0]
    person = _after_colon(fields[1])

    if len(fields) >= 8:
        # Presumably the capital text was split into two tokens by a space;
        # glue them back together. Delete by index: list.remove() would drop
        # the first element *equal* to fields[3], which may be an earlier one.
        fields[2] = fields[2] + fields[3]
        del fields[3]

    capital = _after_colon(fields[2])
    numbers = re.findall(r"-?\d+\.?\d*", capital)
    if not numbers:
        # No numeric amount to convert; skip the entry.
        return None
    # Convert to a number expressed in units of 10,000 yuan.
    capnum = float(numbers[0])
    if '亿' in capital:
        # The amount is given in hundred-millions; rescale it.
        capnum = capnum * 10000

    date = _after_colon(fields[3])

    if len(fields) >= 7:
        stock_type = _after_colon(fields[4])
        stock_code = _after_colon(fields[5])
    else:
        # No security information in this entry.
        stock_type = '暂无'
        stock_code = None

    return [company, person, capnum, date, stock_type, stock_code]


# Core crawler code
def crawl(index):
    """Fetch one listing page and append its parsed rows to ``tl``.

    Args:
        index: Page suffix inserted between ``url_head`` and ".html"
            ('' for the first page, '_2', '_3', ... for later ones).

    A response whose status code is not 200 is ignored. Network errors
    raised by ``requests`` (including the timeout) propagate to the caller.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(url_head + index + ".html", headers=Headers,
                            timeout=10)
    if response.status_code != 200:
        return

    # Parse the page HTML with BeautifulSoup4 using the "html.parser" backend.
    soup = BeautifulSoup(response.text, features='html.parser')

    # Walk every <div> and keep those whose class list is exactly
    # ["CoListTxt"]. Tag.get() returns None for a <div> without a class
    # attribute, where div["class"] would raise KeyError.
    for div in soup.find_all('div'):
        if div.get("class") != ["CoListTxt"]:
            continue
        row = _parse_company_text(div.text)
        if row is not None:
            # Merge the single scraped record into the shared list.
            tl.append(row)


def main():
    """Crawl all pages, write ``result_2.csv``, print and plot the first 20 rows."""
    pages = 17  # number of site pages to fetch

    # The first page has no numeric suffix; later pages use "_2", "_3", ...
    inds = [''] + ['_' + str(i) for i in range(2, pages + 1)]
    for ind in inds:
        crawl(ind)

    # Convert the list of rows into a DataFrame.
    df = pd.DataFrame(tl, columns=['公司', '法人', '注册资本', '注册时间', '证券', '股票代码'])

    # Save as a CSV file.
    df.to_csv('result_2.csv', index=False, encoding='utf_8_sig')

    # Print company name and registered capital for the first 20 rows.
    print(df[['公司', '注册资本']].iloc[:20])

    # Draw a bar chart; SimHei is selected so the Chinese labels render.
    plt.rcParams['font.sans-serif'] = 'SimHei'
    plt.rcParams['axes.unicode_minus'] = False

    ndf = df[['公司', '注册资本']].iloc[:20]
    ndf = ndf.set_index(ndf['公司'])
    ndf.plot(kind='bar')
    plt.xlabel('公司名')
    plt.xticks(rotation=90)
    plt.ylabel('注册资金(万元)')
    plt.title('注册资金前20强公司')
    plt.show()

    return None


if __name__=='__main__':
    main()