# -*- coding: utf-8 -*-
"""
Created on Sun May 26 19:24:00 2024

@author: Panda
"""

import requests
import pandas as pd
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

url_head = "https://top.chinaz.com/gongsi/index_zhuce"

Headers = {
    # a browser-like User-Agent; many sites reject requests' default one
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/104.0.5112.102"
}

# Nested list that stores the scraped data, one row per company
# (module-level, so crawl() can append to it)
tl = []


# Core of the crawler: fetch one listing page and parse each company entry
def crawl(index):
    response = requests.get(url_head + index + ".html", headers=Headers)
    if response.status_code == 200:
        page_doc = response.text

        # Parse the page's HTML source with BeautifulSoup 4
        # soup = BeautifulSoup(page_doc, features='lxml')  # style for older lxml + bs4 versions
        soup = BeautifulSoup(page_doc, features='html.parser')  # style since at least lxml 4.6.3

        # Grab all the <div> tags
        div_list = soup.find_all('div')
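
        # An equivalent, more direct bs4 query would filter by class up front:
        #   div_list = soup.find_all('div', class_='CoListTxt')
        # (same find_all API; shown only as an alternative to the manual scan below)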

        # Walk the tags looking for elements whose class attribute is "CoListTxt"
        for div in div_list:
            attrib = div.get("class")  # .get() returns None instead of raising KeyError on class-less divs
            if attrib == ["CoListTxt"]:
                string = div.text
                row = string.strip('\n').replace('\n', ' ').split(' ')
                row = list(filter(None, row))
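
                # Illustrative only, with hypothetical text in the page's layout:
                #   '某某公司 法人:张三 注册资本:500万元 注册时间:2000-01-01'
                # splits and filters down to
                #   ['某某公司', '法人:张三', '注册资本:500万元', '注册时间:2000-01-01']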

                company = row[0]
                t = row[1]
                person = row[1][t.find(':') + 1:]
                if len(row) >= 8:
                    # Extra token: the capital field split in two, so merge it back
                    row[2] = row[2] + row[3]
                    row.pop(3)  # drop by index; remove() matches by value and could hit the wrong element
                t = row[2]
                capital = row[2][t.find(':') + 1:]

                # Convert to a number in units of 万元 (10,000 yuan)
                capnum = [float(t) for t in re.findall(r"-?\d+\.?\d*", capital)][0]
                if '亿' in capital:
                    capnum = capnum * 10000  # 1 亿 = 10,000 万
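
                # Worked example with a made-up value: for capital == '3.5亿元',
                # re.findall(r"-?\d+\.?\d*", '3.5亿元') yields ['3.5'], and the
                # '亿' branch scales it to capnum == 35000.0 万元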

                t = row[3]
                date = row[3][t.find(':') + 1:]
                if len(row) >= 7:
                    t = row[4]
                    stock_type = row[4][t.find(':') + 1:]
                    t = row[5]
                    stock_code = row[5][t.find(':') + 1:]
                else:
                    stock_type = '暂无'  # "none yet": the company is not listed
                    stock_code = None
                row = [company, person, capnum, date, stock_type, stock_code]
                # Fold this single record into the nested list
                tl.append(row)


def main():
    # Build the suffix for each page's URL
    inds = []
    pages = 17  # number of listing pages to fetch
    for i in range(1, pages + 1):
        ch = '_' + str(i)
        inds.append(ch)
    inds[0] = ''  # the first page carries no "_1" suffix
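
    # The resulting URLs follow the site's pattern:
    #   page 1: https://top.chinaz.com/gongsi/index_zhuce.html
    #   page 2: https://top.chinaz.com/gongsi/index_zhuce_2.html, and so on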

    for ind in inds:
        crawl(ind)
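
    # Optional courtesy, not in the original: `import time` at the top and a
    # time.sleep(1) inside the loop above would pace the requests politely.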

    # Convert the nested list into a DataFrame
    df = pd.DataFrame(tl, columns=['公司', '法人', '注册资本', '注册时间', '证券', '股票代码'])

    # Save as a CSV file (utf_8_sig writes a BOM so Excel displays the Chinese text correctly)
    df.to_csv('result_2.csv', index=False, encoding='utf_8_sig')

    # Print the first 20 entries of the registered-capital top-500 list
    print(df[['公司', '注册资本']].iloc[:20])

    # Visualize the top 20 as a bar chart
    plt.rcParams['font.sans-serif'] = 'SimHei'  # a font that can render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False
    ndf = df[['公司', '注册资本']].iloc[:20]
    ndf = ndf.set_index('公司')  # index by company name so the bars are labelled
    ndf.plot(kind='bar')
    plt.xlabel('公司名')
    plt.xticks(rotation=90)
    plt.ylabel('注册资金(万元)')
    plt.title('注册资金前20强公司')
    plt.show()


if __name__ == '__main__':
    main()