diff --git a/2024头歌大作业2.py b/2024头歌大作业2.py
new file mode 100644
index 0000000..99e486d
--- /dev/null
+++ b/2024头歌大作业2.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun May 26 19:24:00 2024
+
+@author: Panda
+"""
+
+import requests
+import pandas as pd
+import re
+import matplotlib.pyplot as plt
+from bs4 import BeautifulSoup
+
+# Base URL of the listing pages; crawl() appends a page suffix and ".html".
+url_head = "https://top.chinaz.com/gongsi/index_zhuce"
+
+# HTTP headers sent with every page request made by crawl().
+Headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/104.0.5112.102",
+}
+
+# Accumulator for the scraped data: one inner list per company (list of lists).
+# No `global` statement is needed here: at module level it has no effect, and
+# crawl() only mutates the list in place.
+tl = []
+
+#爬虫核心代码
+# First number (optionally signed / decimal) found in a capital string.
+_NUMBER_RE = re.compile(r"-?\d+\.?\d*")
+
+
+def _after_colon(field):
+    """Return the text after the first ':' in *field* (all of it if none)."""
+    return field.split(':', 1)[-1]
+
+
+def _parse_company_row(text):
+    """Convert the text of one "CoListTxt" div into a six-item record.
+
+    Args:
+        text: The div's text content; fields are separated by spaces and
+            newlines, most of them in "label:value" form.
+
+    Returns:
+        ``[company, person, capital, date, stock_type, stock_code]`` where
+        ``capital`` is a float in units of 万元 (or None when the capital
+        field holds no number), or None when the text has fewer than the
+        four fields that are always read.
+    """
+    fields = [token for token in text.replace('\n', ' ').split(' ') if token]
+
+    if len(fields) >= 8:
+        # With eight or more tokens, tokens 2 and 3 are joined into one
+        # (presumably the capital value itself contained a space). Slice
+        # assignment drops exactly index 3; list.remove() would drop the
+        # first *equal* element, which may be a different one.
+        fields[2:4] = [fields[2] + fields[3]]
+
+    if len(fields) < 4:
+        # Not enough fields for company / person / capital / date.
+        return None
+
+    company = fields[0]
+    person = _after_colon(fields[1])
+    capital = _after_colon(fields[2])
+    date = _after_colon(fields[3])
+
+    # Normalise the capital to a number in 万元: 1 亿 = 10000 万.
+    numbers = _NUMBER_RE.findall(capital)
+    capnum = float(numbers[0]) if numbers else None
+    if capnum is not None and '亿' in capital:
+        capnum = capnum * 10000
+
+    if len(fields) >= 7:
+        stock_type = _after_colon(fields[4])
+        stock_code = _after_colon(fields[5])
+    else:
+        # Placeholder values used when the stock fields are absent.
+        stock_type = '暂无'
+        stock_code = None
+
+    return [company, person, capnum, date, stock_type, stock_code]
+
+
+def crawl(index):
+    """Fetch one listing page and append its company records to ``tl``.
+
+    Args:
+        index: Page suffix placed between ``url_head`` and ".html",
+            e.g. '' or '_2'.
+
+    Returns:
+        None. A page that is not answered with HTTP 200 is skipped without
+        any message, and rows that cannot be parsed are ignored.
+
+    Raises:
+        requests.exceptions.RequestException: If the request fails or does
+            not complete within the 10-second timeout.
+    """
+    # A timeout keeps a stalled server from blocking the crawl forever.
+    response = requests.get(
+        url_head + index + ".html", headers=Headers, timeout=10
+    )
+    if response.status_code != 200:
+        return
+
+    # Parse with the standard-library "html.parser" backend.
+    soup = BeautifulSoup(response.text, features='html.parser')
+
+    # Walk every <div> and keep those whose class list is exactly
+    # ["CoListTxt"]. Tag.get() is used because many <div> tags carry no
+    # class attribute, for which div["class"] would raise KeyError.
+    for div in soup.find_all('div'):
+        if div.get("class") != ["CoListTxt"]:
+            continue
+        record = _parse_company_row(div.text)
+        if record is not None:
+            # Add this company's record to the shared list of lists.
+            tl.append(record)
+
+
+def main(pages=17):
+    """Crawl the listing pages, save the rows as CSV and chart the first 20.
+
+    Args:
+        pages: Number of listing pages to fetch. Defaults to 17, the value
+            that used to be hard-coded.
+
+    Returns:
+        None. Side effects: fills the module-level ``tl``, writes
+        'result_2.csv', prints a table and opens a matplotlib window.
+    """
+    # The first page has no suffix; page i (i >= 2) uses "_i".
+    suffixes = [
+        '' if page == 1 else '_' + str(page) for page in range(1, pages + 1)
+    ]
+    for suffix in suffixes:
+        crawl(suffix)
+
+    # Turn the accumulated list of lists into a DataFrame.
+    df = pd.DataFrame(
+        tl, columns=['公司', '法人', '注册资本', '注册时间', '证券', '股票代码']
+    )
+    # Save as CSV; the utf_8_sig codec writes a BOM (presumably so that
+    # spreadsheet tools detect the encoding).
+    df.to_csv('result_2.csv', index=False, encoding='utf_8_sig')
+
+    # First 20 rows in crawl order. Nothing is sorted here, so "top 20"
+    # relies on the order in which the site lists the companies.
+    top = df[['公司', '注册资本']].iloc[:20]
+    print(top)
+
+    if top.empty:
+        # Nothing was scraped; plotting a frame with no numeric data raises.
+        return None
+
+    # Bar chart of the same rows. SimHei is selected so the Chinese labels
+    # render, and unicode_minus is disabled so minus signs use ASCII '-'.
+    plt.rcParams['font.sans-serif'] = 'SimHei'
+    plt.rcParams['axes.unicode_minus'] = False
+    top.set_index('公司').plot(kind='bar')
+    plt.xlabel('公司名')
+    plt.xticks(rotation=90)
+    plt.ylabel('注册资金(万元)')
+    plt.title('注册资金前20强公司')
+
+    plt.show()
+    return None
+
+# Run the scraper only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
\ No newline at end of file