You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
CCCC/2024头歌大作业2.py

102 lines
3.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
"""
Created on Sun May 26 19:24:00 2024
@author: Panda
"""
import requests
import pandas as pd
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
# Base URL of the company listing; crawl() appends a page suffix and ".html".
url_head = "https://top.chinaz.com/gongsi/index_zhuce"

# HTTP headers sent with every page request (browser-like User-Agent).
Headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/104.0.5112.102",
}

# Two-level list that accumulates the scraped records: crawl() appends one
# inner list per company and main() turns the whole thing into a DataFrame.
tl = []
#爬虫核心代码
def crawl(index):
    """Download one listing page and append its company records to ``tl``.

    Args:
        index: Page suffix inserted between ``url_head`` and ``".html"``
            (``''`` for the first page, ``'_2'``, ``'_3'``, ... afterwards).

    Returns:
        None. Every parsed record is appended to the module-level list
        ``tl`` as ``[company, person, capital, date, stock_type, stock_code]``.
        A page that does not answer with HTTP 200 contributes nothing.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url_head + index + ".html", headers=Headers, timeout=10)
    if response.status_code != 200:
        return

    soup = BeautifulSoup(response.text, features='html.parser')
    for div in soup.find_all('div'):
        # Tag.get() yields None for a <div> without a class attribute, whereas
        # div["class"] would raise KeyError on it.
        if div.get("class") != ["CoListTxt"]:
            continue
        record = _parse_company(div.text)
        if record is not None:
            tl.append(record)


def _field_value(token):
    """Return the text after the first colon in *token* (all of it if none)."""
    # The original code searched for an empty string, which always matches at
    # index 0 and therefore only chopped off the first character. The label
    # separator is presumably a full-width colon (written as an escape so it
    # cannot be lost again); an ASCII colon is accepted as well.
    return re.split('[\uff1a:]', token, maxsplit=1)[-1]


def _parse_company(text):
    """Convert the text of one ``CoListTxt`` block into a record list.

    Returns ``None`` when the block has fewer than the four leading tokens
    (company, person, capital, date) so that one malformed entry does not
    abort the whole crawl.
    """
    # Tokenise on spaces and newlines, discarding empty pieces.
    tokens = [tok for tok in text.replace('\n', ' ').split(' ') if tok]
    if len(tokens) < 4:
        return None

    if len(tokens) >= 8:
        # An extra token means the capital field was split in two; glue the
        # halves together. Delete by position: list.remove() would drop the
        # first *equal* value, which is not necessarily index 3.
        tokens[2] += tokens[3]
        del tokens[3]

    company = tokens[0]
    person = _field_value(tokens[1])

    capital = _field_value(tokens[2])
    # First number in the capital text; NaN keeps the column numeric when the
    # field carries no digits at all.
    number = re.search(r"-?\d+\.?\d*", capital)
    capnum = float(number.group()) if number else float('nan')
    # Normalise to units of 10,000 (wan): amounts given in yi are 10,000 wan.
    if '亿' in capital:
        capnum = capnum * 10000

    date = _field_value(tokens[3])

    if len(tokens) >= 7:
        stock_type = _field_value(tokens[4])
        stock_code = _field_value(tokens[5])
    else:
        # No stock information present for this company.
        stock_type = '暂无'
        stock_code = None

    return [company, person, capnum, date, stock_type, stock_code]
def main():
    """Crawl all listing pages, save the table as CSV and chart the first 20 rows.

    Calls ``crawl`` once per page, builds a DataFrame from the module-level
    list ``tl``, writes it to ``result_2.csv``, prints the company and
    registered-capital columns of the first 20 rows and shows them as a bar
    chart. Rows are taken in crawl order; no sorting is applied here.

    Returns:
        None.
    """
    pages = 17  # number of listing pages to fetch
    # The first page has no numeric suffix; later pages use "_2", "_3", ...
    inds = [''] + ['_' + str(i) for i in range(2, pages + 1)]
    for ind in inds:
        crawl(ind)

    # Turn the two-level list into a DataFrame.
    df = pd.DataFrame(tl, columns=['公司', '法人', '注册资本', '注册时间', '证券', '股票代码'])
    # utf_8_sig writes a BOM ahead of the UTF-8 data.
    df.to_csv('result_2.csv', index=False, encoding='utf_8_sig')

    # Company name and registered capital of the first 20 rows.
    ndf = df[['公司', '注册资本']].iloc[:20]
    print(ndf)

    if ndf.empty:
        # Nothing was scraped; plotting an empty frame would raise
        # "no numeric data to plot", so stop after the CSV/printout.
        print('未获取到任何数据，跳过绘图')
        return

    # Bar chart of the same 20 rows.
    plt.rcParams['font.sans-serif'] = 'SimHei'  # font used for the CJK labels
    plt.rcParams['axes.unicode_minus'] = False
    ndf = ndf.set_index(ndf['公司'])
    ndf.plot(kind='bar')
    plt.xlabel('公司名')
    plt.xticks(rotation=90)
    plt.ylabel('注册资金(万元)')
    plt.title('注册资金前20强公司')
    plt.show()
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()