You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
CCCC/2024头歌大作业.py

95 lines
3.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
"""
Created on Sun May 26 12:41:54 2024
@author: Panda
"""
import requests
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
# Common URL prefix of the ranking pages; crawl() appends a page suffix and ".html".
url_head = "https://top.chinaz.com/gongsitop/index_500top"
# HTTP headers sent with every request (a browser-like User-Agent).
Headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/104.0.5112.102"
}
# Module-level list of rows (a list of lists) that crawl() appends to and
# main() turns into a DataFrame. No `global` statement is needed at module scope.
tl = []
# Core crawler code
def _field_value(token):
    """Return the part of *token* after its label separator.

    Tokens are expected to look like ``label：value``. If no separator is
    present the token is returned unchanged.

    NOTE(review): the separator literal was empty in the reviewed copy, so
    ``find('')`` always returned 0 and exactly one character was dropped.
    It is restored here as the full-width colon (with ASCII ':' as a
    fallback) -- confirm against the live page text.
    """
    for separator in ('：', ':'):
        _label, found, value = token.partition(separator)
        if found:
            return value
    return token


def crawl(index):
    """Fetch one ranking page and append its company rows to ``tl``.

    Args:
        index: Page suffix inserted between ``url_head`` and ``".html"``
            (``''`` for the first page, ``'_2'``, ``'_3'``, ... afterwards).

    Returns:
        None. Each parsed entry is appended to the module-level list ``tl``
        as ``[company, person, capital, date, stock_type, stock_code]``.
        Nothing is appended when the response status is not 200.
    """
    # A timeout keeps a stalled connection from blocking the whole run.
    response = requests.get(url_head + index + ".html", headers=Headers, timeout=10)
    if response.status_code != 200:
        return

    # Parse the page HTML with the parser bundled with the standard library.
    soup = BeautifulSoup(response.text, features='html.parser')

    for div in soup.find_all('div'):
        # .get() returns None for <div> tags without a class attribute,
        # where div["class"] would raise KeyError. Only divs whose class
        # list is exactly ["CoListTxt"] are processed.
        if div.get("class") != ["CoListTxt"]:
            continue

        # Split the div's text into non-empty, space-separated tokens.
        row = div.text.strip('\n').replace('\n', ' ').split(' ')
        row = list(filter(None, row))
        if len(row) < 4:
            # Too few tokens to hold company/person/capital/date; skip it.
            continue

        company = row[0]
        person = _field_value(row[1])

        if len(row) >= 8:
            # With 8+ tokens, tokens 2 and 3 are merged into a single capital
            # field (presumably amount and currency were split by a space --
            # confirm against the page). Slice assignment removes by position,
            # unlike list.remove(), which removes the first equal value.
            row[2:4] = [row[2] + row[3]]

        capital = _field_value(row[2])
        date = _field_value(row[3])

        if len(row) >= 7:
            stock_type = _field_value(row[4])
            stock_code = _field_value(row[5])
        else:
            # No stock fields in this entry.
            stock_type = '暂无'
            stock_code = None

        # Add this record to the shared list of rows.
        tl.append([company, person, capital, date, stock_type, stock_code])
def main():
    """Crawl every ranking page, save the rows to CSV and plot a pie chart.

    Calls ``crawl`` once per page suffix, builds a DataFrame from the
    module-level ``tl`` list, writes it to ``result.csv``, prints the
    percentage share of each value in the securities column, and shows a
    pie chart of the three most frequent values.
    """
    pages = 16  # total number of pages on the site

    # The first page has no suffix; page n (n >= 2) uses "_n".
    suffixes = ['' if n == 1 else '_' + str(n) for n in range(1, pages + 1)]
    for suffix in suffixes:
        crawl(suffix)

    # Turn the list of rows into a DataFrame.
    column_names = ['公司', '法人', '注册资本', '注册时间', '证券', '股票代码']
    df = pd.DataFrame(tl, columns=column_names)

    # Save as a CSV file.
    df.to_csv('result.csv', index=False, encoding='utf_8_sig')

    # Percentage share of each securities category.
    percentages = df['证券'].value_counts(normalize=True) * 100
    print(percentages)

    # Visualise the three largest shares.
    top_shares = percentages.head(3)
    plt.rcParams.update({
        'font.sans-serif': 'SimHei',
        'axes.unicode_minus': False,
    })
    top_shares.plot.pie(
        autopct='%1.1f%%',
        startangle=90,
        shadow=False,
        labels=top_shares.index,
    )
    plt.title('500强企业证券类别占比饼图')
    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.show()
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()