From ce0d54720a0e7e24b7de670c3064fe76832519b3 Mon Sep 17 00:00:00 2001
From: hnu202309010114 <1736774990@qq.com>
Date: Mon, 27 May 2024 12:56:22 +0800
Subject: [PATCH] ADD file via upload

---
 2024头歌大作业2.py | 102 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 2024头歌大作业2.py

diff --git a/2024头歌大作业2.py b/2024头歌大作业2.py
new file mode 100644
index 0000000..99e486d
--- /dev/null
+++ b/2024头歌大作业2.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun May 26 19:24:00 2024
+
+@author: Panda
+"""
+
+import requests
+import pandas as pd
+import re
+import matplotlib.pyplot as plt
+from bs4 import BeautifulSoup
+
+url_head = "https://top.chinaz.com/gongsi/index_zhuce"
+Headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/104.0.5112.102",
+}
+
+# Nested list that accumulates one row per scraped company
+tl = []
+
+# Core crawler: fetch one listing page and extract every company record
+def crawl(index):
+    response = requests.get(url_head + index + ".html", headers=Headers)
+    if response.status_code == 200:
+        page_doc = response.text
+
+        # Parse the page HTML with BeautifulSoup
+        # soup = BeautifulSoup(page_doc, features='lxml')  # alternative: the lxml parser
+        soup = BeautifulSoup(page_doc, features='html.parser')  # built-in parser, no lxml dependency
+
+        # Collect all <div> tags
+        div_list = soup.find_all('div')
+
+        # Keep only elements whose class attribute is "CoListTxt"
+        for div in div_list:
+            attrib = div.get("class")  # .get() avoids a KeyError on divs without a class
+            if attrib == ["CoListTxt"]:
+                string = div.text
+                row = string.strip('\n').replace('\n', ' ').split(' ')
+                row = list(filter(None, row))
+                company = row[0]
+                t = row[1]
+                person = row[1][t.find(':')+1:]
+                # The capital field can split into two tokens; merge them back
+                if len(row) >= 8:
+                    row[2] = row[2] + row[3]
+                    del row[3]
+                t = row[2]
+                capital = row[2][t.find(':')+1:]
+                # Convert the capital string to a number in units of 10,000 yuan
+                capnum = [float(t) for t in re.findall(r"-?\d+\.?\d*", capital)][0]
+                if '亿' in capital:  # 1 亿 = 10,000 万
+                    capnum = capnum * 10000
+                t = row[3]
+                date = row[3][t.find(':')+1:]
+                if len(row) >= 7:
+                    t = row[4]
+                    stock_type = row[4][t.find(':')+1:]
+                    t = row[5]
+                    stock_code = row[5][t.find(':')+1:]
+                else:
+                    stock_type = '暂无'
+                    stock_code = None
+                row = [company, person, capnum, date, stock_type, stock_code]
+                # Append this record to the result list
+                tl.append(row)
+
+
+def main():
+    inds = []
+    pages = 17  # number of listing pages to fetch
+    for i in range(1, pages+1):
+        ch = '_' + str(i)
+        inds.append(ch)
+    inds[0] = ''  # the first page has no "_1" suffix
+    for ind in inds:
+        crawl(ind)
+    # Convert the nested list into a DataFrame
+    df = pd.DataFrame(tl, columns=['公司', '法人', '注册资本', '注册时间', '证券', '股票代码'])
+    # Save as a CSV file
+    df.to_csv('result_2.csv', index=False, encoding='utf_8_sig')
+
+    # Print the first 20 entries of the registered-capital ranking
+    print(df[['公司', '注册资本']].iloc[:20])
+
+    # Visualize the top 20 as a bar chart
+    plt.rcParams['font.sans-serif'] = 'SimHei'
+    plt.rcParams['axes.unicode_minus'] = False
+    ndf = df[['公司', '注册资本']].iloc[:20]
+    ndf = ndf.set_index('公司')
+    ndf.plot(kind='bar')
+    plt.xlabel('公司名')
+    plt.xticks(rotation=90)
+    plt.ylabel('注册资金(万元)')
+    plt.title('注册资金前20强公司')
+
+    plt.show()
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file