"""Scrape a "top 500 companies" listing and chart registered capital.

The script downloads the listing pages, extracts (company, registered
capital) pairs, converts each capital to a number (the chart labels the
unit as 亿元), writes the pairs to ``content2.csv`` and draws a bar chart
of the 20 largest values.
"""

import csv
import re
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import requests

warnings.filterwarnings("ignore")

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
message = []
message2 = []

# NOTE(review): both scraping patterns look damaged. The company pattern has
# no literal delimiters around its capture group, and the text that follows
# the capital capture group appears in the reviewed source only as line
# breaks. Presumably HTML tags were stripped from both -- confirm against
# the real page markup before relying on the scraped data.
COMPANY_PATTERN = '(.*?)'
CAPITAL_PATTERN = '注册资本:(.*?)\n\n'

# Seconds to wait for each listing page before giving up.
REQUEST_TIMEOUT = 10

_NUMBER_RE = re.compile(r'\d+\.?\d*')

# Amounts quoted in 万 (1e4) are divided by this to express them in 亿 (1e8).
_WAN_PER_YI = 10000
# Presumably HKD->CNY and USD->CNY exchange rates -- TODO confirm.
_HKD_RATE = 0.9392
_USD_RATE = 7.1876

# Suffixes that mark an amount quoted in 万 of RMB.
_RMB_WAN_SUFFIXES = ("万元人民币", "万人民币元", "万人民币", "万元", "万")


def _to_yi(capital):
    """Convert one registered-capital string to a rounded float.

    Returns ``None`` when the string is empty or contains no number, so the
    caller can decide what to do with the entry instead of crashing.
    """
    if not capital:
        return None
    # Drop thousands separators so "1,000万元" is read as 1000, not 1.
    found = _NUMBER_RE.search(capital.replace(",", ""))
    if found is None:
        return None
    amount = float(found.group())

    if capital.startswith("(人民币)") or capital.endswith(_RMB_WAN_SUFFIXES):
        return round(amount / _WAN_PER_YI, 2)
    if capital.startswith("(港币)") or capital.endswith("万港币"):
        return round(amount * _HKD_RATE / _WAN_PER_YI, 2)
    if capital.endswith("万美元"):
        return round(amount * _USD_RATE / _WAN_PER_YI, 2)
    # No recognised 万/currency marker: the number is used as-is.
    return round(amount, 2)


def transform(capitals):
    """Convert an iterable of capital strings to a list of floats.

    Empty strings and strings without any number are skipped, so the result
    may be shorter than the input.
    """
    converted = (_to_yi(capital) for capital in capitals)
    return [value for value in converted if value is not None]


def getInfo(html):
    """Extract (company, capital) pairs from one page and add them to ``message2``.

    Each capital is converted next to its own company, so an entry whose
    capital cannot be parsed is dropped on its own instead of shifting every
    later company onto the wrong value.
    """
    companies = re.findall(COMPANY_PATTERN, html)
    capitals = re.findall(CAPITAL_PATTERN, html)
    for company, capital in zip(companies, capitals):
        value = _to_yi(capital)
        if value is not None:
            message2.append((company, value))


def save_message(list_message):
    """Write the rows stored at ``list_message[1]`` to ``content2.csv``."""
    rows = list_message[1]
    # newline='' lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows.
    with open("content2.csv", "w", encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(rows)


def draw_first():
    """Build a bar chart of the 20 largest capitals read from ``content2.csv``."""
    frame = pd.read_csv('content2.csv', names=["company", "capital"])
    top = frame.sort_values(["capital"], ascending=False).head(20)

    def autolabel(rects):
        # Write the integer part of each bar's height just above the bar.
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width() / 100. - 0.2, 1.1 * height, '%s' % int(height))

    heights = top['capital']
    labels = top['company']
    autolabel(plt.bar(x=range(0, len(heights)), height=heights, tick_label=labels))
    plt.xlabel('公司排名', labelpad=20, fontsize=12, fontfamily='sans-serif', color='gray')
    plt.ylabel('注册资金(单位:亿元)', labelpad=15, fontsize=12, fontfamily='sans-serif', color='gray')
    plt.title('注册资金top20的公司', pad=20, fontsize=16, fontfamily='sans-serif', color='black')
    plt.xticks(rotation=270)
    axes = plt.gca()
    axes.spines['top'].set_visible(False)
    axes.spines['right'].set_visible(False)
    plt.gcf().set_facecolor('white')
    plt.tight_layout()


def main():
    """Download the 16 listing pages, save the parsed rows and show the chart."""
    for page in range(16):
        if page == 0:
            url = "https://top.chinaz.com/gongsitop/index_500top.html"
        else:
            url = "https://top.chinaz.com/gongsitop/index_500top_{}.html".format(page + 1)

        # A timeout keeps one stalled connection from hanging the script.
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        getInfo(response.text)

    list_message = [message, message2]
    save_message(list_message)
    draw_first()
    # Without this the figure is built but never displayed or saved.
    plt.show()


if __name__ == '__main__':
    main()