From 683c05b9af0aec3981eae06b2fbcae4a21904ea9 Mon Sep 17 00:00:00 2001
From: hnu202309010114 <1736774990@qq.com>
Date: Mon, 27 May 2024 13:03:40 +0800
Subject: [PATCH] ADD file via upload
---
2024头歌大作业.py | 95 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 95 insertions(+)
create mode 100644 2024头歌大作业.py
diff --git a/2024头歌大作业.py b/2024头歌大作业.py
new file mode 100644
index 0000000..a18e6c8
--- /dev/null
+++ b/2024头歌大作业.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun May 26 12:41:54 2024
+
+@author: Panda
+"""
+
+import requests
+import pandas as pd
+import matplotlib.pyplot as plt
+from bs4 import BeautifulSoup
+
+url_head = "https://top.chinaz.com/gongsitop/index_500top"
+Headers = {
+ "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/104.0.5112.102"
+ }
+
+# nested list that accumulates one row per company across all pages
+tl = []
+
+# core crawler: fetch one listing page and extract every company entry on it
+def crawl(index):
+    response = requests.get(url_head + index + ".html", headers=Headers)
+    if response.status_code == 200:
+        page_doc = response.text
+
+        # parse the page's HTML source with BeautifulSoup
+        # soup = BeautifulSoup(page_doc, features='lxml')  # alternative: the lxml parser, if lxml is installed
+        soup = BeautifulSoup(page_doc, features='html.parser')  # Python's built-in parser, no extra dependency
+
+        # collect every <div> tag on the page
+        div_list = soup.find_all('div')
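+        # each "CoListTxt" block holds one company's details as plain text: name,
+        # legal representative, registered capital, registration date and, for listed
+        # companies, the securities type and stock code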
+
+        # walk the divs, keeping only elements whose class is "CoListTxt"
+        for div in div_list:
+            attrib = div.get("class")  # .get() avoids a KeyError on divs without a class attribute
+            if attrib == ["CoListTxt"]:
+                string = div.text
+                # flatten the block's text into a list of non-empty fields
+                row = string.strip('\n').replace('\n', ' ').split(' ')
+                row = list(filter(None, row))
+                company = row[0]
+                t = row[1]
+                person = row[1][t.find(':')+1:]
+                if len(row) >= 8:
+                    # an extra space split one field in two; merge the pieces back
+                    row[2] = row[2] + row[3]
+                    del row[3]
+                t = row[2]
+                capital = row[2][t.find(':')+1:]
+                t = row[3]
+                date = row[3][t.find(':')+1:]
+                if len(row) >= 7:
+                    t = row[4]
+                    stock_type = row[4][t.find(':')+1:]
+                    t = row[5]
+                    stock_code = row[5][t.find(':')+1:]
+                else:
+                    stock_type = '暂无'  # "not available": the company is not listed
+                    stock_code = None
+                row = [company, person, capital, date, stock_type, stock_code]
+                # add this company's record to the shared nested list
+                tl.append(row)
+
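+# quick sanity check (hypothetical, not called anywhere in the script): scrape only the
+# first page and inspect a few rows before launching the full 16-page crawl:
+#   crawl('')        # https://top.chinaz.com/gongsitop/index_500top.html
+#   print(tl[:3])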
+
+def main():
+    # build the page suffixes: '' for the first page, '_2' .. '_16' for the rest
+    inds = []
+    pages = 16  # total number of pages on the site
+    for i in range(1, pages+1):
+        ch = '_' + str(i)
+        inds.append(ch)
+    inds[0] = ''
+    for ind in inds:
+        crawl(ind)
+    # convert the nested list into a DataFrame
+    # (columns: company, legal representative, registered capital, registration date,
+    #  securities type, stock code)
+    df = pd.DataFrame(tl, columns=['公司', '法人', '注册资本', '注册时间', '证券', '股票代码'])
+    # save the result as a CSV file
+    df.to_csv('result.csv', index=False, encoding='utf_8_sig')
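+    # (utf_8_sig writes a UTF-8 BOM so the Chinese column names and values display
+    #  correctly when result.csv is opened in Excel)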
+
+    # analyse the share of each securities category:
+    # value_counts(normalize=True) returns each category's fraction of all rows
+    percentages = df['证券'].value_counts(normalize=True) * 100
+    print(percentages)
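+    # value_counts sorts by frequency in descending order, so .head(3) below keeps
+    # the three most common securities categories for the pie chart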
+
+    # plot the top categories as a pie chart
+    percentages = percentages.head(3)
+    plt.rcParams['font.sans-serif'] = 'SimHei'  # use a Chinese-capable font for the labels
+    plt.rcParams['axes.unicode_minus'] = False
+    percentages.plot.pie(autopct='%1.1f%%', startangle=90, shadow=False, labels=percentages.index)
+    plt.title('500强企业证券类别占比饼图')  # "Securities-category share of the Top-500 companies"
+    plt.axis('equal')  # equal aspect ratio ensures the pie is drawn as a circle
+    plt.show()
+
+    return None
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file