# -*- coding: utf-8 -*-
"""
Created on Sun May 15 20:46:07 2022

@author: 张景瑞

Scrape (name, registered capital) pairs from the top.chinaz.com company
listing pages, store them in ``content.csv`` and print the first 20 rows.
"""
import csv
import re

import pandas as pd

# Request headers: present a desktop browser user agent.
HEADERS = {
    "user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
}

# Number of listing pages to download.
PAGE_COUNT = 10

# Output file and its encoding.  GB18030 is a superset of GB2312/GBK that can
# still encode every Unicode character, so writing never fails on a rare
# character and the file stays readable by GBK-based tools.
CSV_PATH = "content.csv"
CSV_ENCODING = "gb18030"

# NOTE(review): in this copy of the file the HTML tags that anchored these two
# patterns appear to have been stripped (the name pattern is a bare group and
# the fund pattern ends in a blank line).  The string values are kept exactly
# as found; restore the original markup before relying on the results.
NAME_PATTERN = re.compile(r"(.+?)")
FUND_PATTERN = re.compile(r"注册资本:(.*?)\n\n")


def page_url(page: int) -> str:
    """Return the listing URL for the zero-based page index *page*.

    The first page has no numeric suffix; every later page uses a one-based
    suffix, so index 1 maps to ``index_zhuce_2.html``.
    """
    if page == 0:
        return "https://top.chinaz.com/gongsi/index_zhuce.html"
    return "https://top.chinaz.com/gongsi/index_zhuce_{}.html".format(page + 1)


def fetch_html(url: str, timeout: float = 10) -> str:
    """Download *url* with the browser-like headers and return the body text.

    Raises:
        requests.RequestException: on connection errors, on a timeout, or
            when the server answers with an HTTP error status.
    """
    # Imported here so the pure helpers in this module remain importable
    # without the third-party dependency.
    import requests

    # Without a timeout a stalled server would block the script forever.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were data.
    response.raise_for_status()
    return response.text


def parse_companies(html: str) -> list:
    """Return the (name, fund) tuples found in one page of *html*.

    Names and funds are matched independently and paired by position; if the
    two lists differ in length the surplus entries are dropped by ``zip``.
    """
    names = NAME_PATTERN.findall(html)
    funds = FUND_PATTERN.findall(html)
    return list(zip(names, funds))


def save_rows(rows, path: str = CSV_PATH) -> None:
    """Write *rows* (an iterable of sequences) to *path* as CSV.

    The file is opened with ``newline=""`` as the ``csv`` module requires;
    otherwise every row ends in ``\\r\\r\\n`` on Windows.
    """
    with open(path, "w", newline="", encoding=CSV_ENCODING) as f:
        csv.writer(f).writerows(rows)


def load_rows(path: str = CSV_PATH) -> pd.DataFrame:
    """Read the CSV at *path* into a DataFrame with columns name and fund.

    Uses the same encoding as :func:`save_rows` so the round trip does not
    depend on the platform's default encoding.
    """
    return pd.read_csv(path, names=["name", "fund"], encoding=CSV_ENCODING)


def main() -> None:
    """Scrape every page, save the rows and print the first 20 of them."""
    message = []
    for page in range(PAGE_COUNT):
        html = fetch_html(page_url(page))
        message.extend(parse_companies(html))  # append this page's rows
    save_rows(message)
    df = load_rows()
    print(df.head(20))


if __name__ == "__main__":
    main()