import requests
import re
import csv
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the process.
warnings.filterwarnings("ignore")

# Request headers carrying a desktop-Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
}

# Module-level row buffers. `message` is extended by getInfo(); nothing
# visible in this file appends to `message2`.
message, message2 = [], []


def transform(capitals, *, hkd_rate=0.9392, usd_rate=7.1876):
    """Convert registered-capital strings into rounded numeric amounts.

    For each non-empty entry the first number in the string is extracted
    (thousands separators are ignored) and then scaled according to the
    currency marker found in the text:

    * ``(人民币)`` prefix, or a ``万元人民币`` / ``万人民币元`` / ``万人民币`` /
      ``万元`` / ``万`` suffix: the number is divided by 10000.
    * ``(港币)`` prefix or ``万港币`` suffix: multiplied by ``hkd_rate`` and
      divided by 10000.
    * ``万美元`` suffix: multiplied by ``usd_rate`` and divided by 10000.
    * anything else: the number is used as-is.

    Every converted value is rounded to two decimals.

    Args:
        capitals: Iterable of capital descriptions (strings); entries may be
            empty or ``None``.
        hkd_rate: HKD -> RMB exchange rate applied to Hong Kong dollar amounts.
        usd_rate: USD -> RMB exchange rate applied to US dollar amounts.

    Returns:
        A list with one element per input entry. Falsy entries and entries
        that contain no number are passed through unchanged so the output
        stays positionally aligned with the input.

    NOTE(review): the indentation of the original was lost, so whether falsy
    entries were kept or dropped could not be determined; keeping them is an
    assumption -- confirm against the caller.
    """
    rmb_suffixes = ("万元人民币", "万人民币元", "万人民币", "万元", "万")
    number_pattern = re.compile(r'\d+\.?\d*')

    converted = []
    for capital in capitals:
        if not capital:
            converted.append(capital)
            continue

        # Drop thousands separators so "1,000" is read as 1000, not 1.
        found = number_pattern.search(capital.replace(",", ""))
        if found is None:
            # No numeric part at all: keep the raw text instead of crashing.
            converted.append(capital)
            continue
        amount = float(found.group())

        if capital.startswith("(人民币)"):
            amount = amount / 10000
        # Foreign-currency markers are tested before the generic RMB suffixes
        # so that e.g. "(港币)...万" is not mistaken for an RMB amount.
        elif capital.startswith("(港币)") or capital.endswith("万港币"):
            # HKD -> RMB conversion.
            amount = amount * hkd_rate / 10000
        elif capital.endswith("万美元"):
            # USD -> RMB conversion.
            amount = amount * usd_rate / 10000
        elif capital.endswith(rmb_suffixes):
            amount = amount / 10000

        converted.append(round(amount, 2))
    return converted


def getInfo(html):
    """Extract company rows from one listing page and append them to ``message``.

    Four regular expressions are run over ``html``; they capture, in order,
    the company name (anchor text inside an ``<h3>``), the legal
    representative, the registration date and the security category. The
    resulting lists are combined position by position into 4-tuples, which are
    appended to the module-level ``message`` list.

    Note that ``zip`` stops at the shortest list, so if any field is missing
    for some entry on the page the rows are truncated (and may be misaligned).

    Args:
        html: Page markup as a string.
    """
    field_patterns = (
        '<a.*?target="_blank">(.*?)</a></h3>',
        '法定代表人:</span>(.*?)</p>',
        '注册时间:</span>(.*?)</p>',
        '证券类别:</span>(.*?)</p>',
    )
    columns = [re.findall(pattern, html) for pattern in field_patterns]
    message.extend(zip(*columns))


def save_message(list_message):
    """Write the first row collection in ``list_message`` to ``content1.csv``.

    The file is created (or overwritten) in the current working directory,
    encoded as UTF-8, with one CSV record per row and no header line.

    Args:
        list_message: Sequence whose first element is an iterable of rows
            (each row an iterable of field values). Other elements are
            ignored.
    """
    rows = list_message[0]
    # newline="" is required by the csv module: the writer emits its own
    # "\r\n" terminators, and without it Windows text mode would turn them
    # into "\r\r\n", producing a blank line after every record.
    with open("content1.csv", "w", encoding='utf-8', newline="") as f:
        csv.writer(f).writerows(rows)


def draw_first():
    """Draw a pie chart of how many companies fall into each security category.

    Reads ``content1.csv`` (no header; columns are named company, person,
    signDate, category), counts the non-null ``company`` values per
    ``category`` and plots the counts as a pie in the first cell of a 1x3
    subplot grid on a new 20x10 inch figure. The figure is only built here;
    it is neither shown nor saved by this function.
    """
    df = pd.read_csv("content1.csv", names=["company", "person", "signDate", "category"])
    counts = df.groupby("category").count()["company"]

    # SimHei is requested as the sans-serif font; presumably so the Chinese
    # labels and title render -- it must be installed on the machine.
    plt.rcParams['font.sans-serif'] = ['SimHei']

    labels = counts.index
    sizes = counts.values
    colors = ["blue", "red", "yellow", "green"]

    # Offset only the last wedge. `explode` must have exactly one entry per
    # wedge, so it is sized from the data rather than fixed at four entries
    # (a fixed tuple makes plt.pie raise when the category count differs).
    explode = [0] * len(sizes)
    if explode:
        explode[-1] = 0.1

    plt.figure(figsize=(20, 10), dpi=80)
    plt.subplot(131)
    plt.pie(
        sizes,
        labels=labels,
        explode=explode,
        autopct='%3.2f%%',
        colors=colors,
        radius=1.5,
        textprops={'fontsize': 15},
        shadow=False,
        startangle=90,
    )
    # Equal aspect ratio keeps the pie circular.
    plt.axis("equal")
    plt.legend()
    # Title text is Chinese: share of each security type among China's
    # top-500 companies.
    plt.title('中国500强公司各证券类型占比', pad=15, fontsize='xx-large', fontweight='heavy')


def main(pages=16, timeout=10):
    """Scrape the listing pages, save the rows to CSV and display the chart.

    The first page lives at ``index_500top.html``; page ``k`` (k >= 2) lives
    at ``index_500top_k.html``. Each page is downloaded with the module-level
    ``headers`` and handed to ``getInfo``, which accumulates rows in the
    module-level ``message`` list. Afterwards the rows are written by
    ``save_message``, the pie chart is built by ``draw_first`` and the figure
    is shown.

    Args:
        pages: Number of listing pages to fetch, starting from the first.
        timeout: Seconds to wait for each HTTP request before
            ``requests`` raises a timeout error, instead of blocking
            indefinitely on a stalled connection.
    """
    for page in range(pages):
        if page == 0:
            url = "https://top.chinaz.com/gongsitop/index_500top.html"
        else:
            # Page numbering in the URL is 1-based and the first page has no
            # numeric suffix, hence `page + 1`.
            url = "https://top.chinaz.com/gongsitop/index_500top_{}.html".format(page + 1)

        response = requests.get(url, headers=headers, timeout=timeout)
        getInfo(response.text)

    list_message = [message, message2]
    save_message(list_message)
    draw_first()
    # Without this call the figure built by draw_first() is discarded when
    # the script exits and nothing is ever displayed.
    plt.show()


# Run the scrape/save/plot pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()