# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 86 lines
# 3.4 KiB

import requests
import re
import csv
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Request headers passed to requests.get(); the User-Agent is a desktop
# Chrome identification string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.69 Safari/537.36"
    ),
}

# Module-level accumulators. `message2` collects (company, capital) tuples
# appended by getInfo(); `message` is only bundled next to it in main().
message, message2 = [], []
def transform(capitals, *, hkd_rate=0.9392, usd_rate=7.1876):
    """Convert registered-capital strings into rounded numeric amounts.

    The first number found in each string is extracted and rescaled
    according to the currency/unit marker around it:

    * RMB markers (prefix "(人民币)" or a suffix in "万元人民币", "万人民币元",
      "万人民币", "万元"): amount / 10000.
    * HKD markers (prefix "(港币)" or suffix "万港币"):
      amount * ``hkd_rate`` / 10000.
    * USD marker (suffix "万美元"): amount * ``usd_rate`` / 10000.
    * Anything else: the amount is kept as-is.

    Every result is rounded to two decimals. The output has exactly one
    entry per input entry, so it can be zipped against a parallel list.

    Args:
        capitals: Iterable of capital strings (may contain empty values).
        hkd_rate: Multiplier applied to HKD amounts (presumably the
            HKD->CNY exchange rate; the default is the original constant).
        usd_rate: Multiplier applied to USD amounts (presumably the
            USD->CNY exchange rate; the default is the original constant).

    Returns:
        A list where each element is a float, the untouched falsy input
        value (e.g. ``""``), or ``None`` when a non-empty string contains
        no number at all.
    """
    rmb_suffixes = ("万元人民币", "万人民币元", "万人民币", "万元")
    converted = []
    for capital in capitals:
        if not capital:
            # Keep empty entries in place so positions stay aligned.
            converted.append(capital)
            continue

        match = re.search(r'\d+\.?\d*', capital)
        if match is None:
            # No digits (placeholder text etc.): previously an IndexError.
            converted.append(None)
            continue
        amount = float(match.group())

        # Dividing a figure expressed in 万 (1e4) by 10000 yields 亿 (1e8).
        if capital.startswith("(人民币)") or capital.endswith(rmb_suffixes):
            value = amount / 10000
        elif capital.startswith("(港币)") or capital.endswith("万港币"):
            value = amount * hkd_rate / 10000
        elif capital.endswith("万美元"):
            value = amount * usd_rate / 10000
        else:
            # NOTE(review): unrecognised units are not rescaled; confirm such
            # values are already expressed in the target unit.
            value = amount
        converted.append(round(value, 2))
    return converted
def getInfo(html):
    """Extract (company, capital) pairs from one page and store them.

    Company names and registered-capital strings are pulled from ``html``
    with two independent regular expressions, the capital strings are
    normalised through ``transform``, and the positionally paired results
    are appended to the module-level ``message2`` list.

    Args:
        html: Page markup as a string.
    """
    names = re.findall('<a.*?target="_blank">(.*?)</a></h3>', html)
    raw_capitals = re.findall('注册资本:</span>(.*?)</p>', html)
    # zip() pairs by position and stops at the shorter of the two lists.
    message2.extend(zip(names, transform(raw_capitals)))
def save_message(list_message):
    """Write the scraped rows to ``content2.csv`` in the working directory.

    Args:
        list_message: Sequence whose element at index 1 is an iterable of
            rows (each row an iterable of cell values) to write. The
            element at index 0 is not used here.
    """
    rows = list_message[1]
    # newline="" hands line-ending control to the csv module; without it
    # text-mode translation on Windows turns "\r\n" into "\r\r\n", which
    # shows up as a blank line after every record.
    with open("content2.csv", "w", encoding='utf-8', newline="") as f:
        csv.writer(f).writerows(rows)
def draw_first():
    """Draw a bar chart of the 20 largest capital values in ``content2.csv``.

    The file is read as two header-less columns (company, capital). Rows
    whose capital is blank or non-numeric are ignored, the rest are sorted
    in descending order, and the top 20 are plotted with a value label
    above each bar. The figure is then displayed with ``plt.show()``.
    """
    lc = pd.read_csv('content2.csv', names=["company", "capital"])
    # Blank or textual cells would break sorting and int() below, so coerce
    # them to NaN and drop those rows before ranking.
    lc["capital"] = pd.to_numeric(lc["capital"], errors="coerce")
    data = (
        lc.dropna(subset=["capital"])
        .sort_values(["capital"], ascending=False)
        .head(20)
    )

    def autolabel(rects):
        """Write each bar's integer-truncated height slightly above it."""
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width() / 100. - 0.2, 1.1 * height, '%s' % int(height))

    x_data = data['capital']
    y_labels = data['company']
    autolabel(plt.bar(x=range(0, len(x_data)), height=x_data, tick_label=y_labels))

    # NOTE(review): the labels are Chinese; whether they render depends on
    # the sans-serif font available to matplotlib on this machine.
    plt.xlabel('公司排名', labelpad=20, fontsize=12, fontfamily='sans-serif', color='gray')
    plt.ylabel('注册资金(单位:亿元)', labelpad=15, fontsize=12, fontfamily='sans-serif', color='gray')
    plt.title('注册资金top20的公司', pad=20, fontsize=16, fontfamily='sans-serif', color='black')
    plt.xticks(rotation=270)

    axes = plt.gca()
    axes.spines['top'].set_visible(False)
    axes.spines['right'].set_visible(False)
    plt.gcf().set_facecolor('white')
    plt.tight_layout()
    # Without this call the figure is built and then discarded when the
    # script exits, so nothing is ever displayed.
    plt.show()
def main():
    """Scrape the 16 ranking pages, save the rows to CSV and plot them.

    The first page has no numeric suffix in its URL; the following pages
    are numbered starting at 2. Each page is parsed by ``getInfo`` (which
    accumulates into ``message2``); afterwards the collected data is
    written by ``save_message`` and charted by ``draw_first``.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    for page in range(16):
        if page == 0:
            url = "https://top.chinaz.com/gongsitop/index_500top.html"
        else:
            url = "https://top.chinaz.com/gongsitop/index_500top_{}.html".format(page + 1)
        # requests waits forever by default; bound the wait so one stalled
        # connection cannot hang the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        getInfo(response.text)

    list_message = [message, message2]
    save_message(list_message)
    draw_first()
# Run the scrape -> save -> plot pipeline only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == '__main__':
    main()