diff --git a/SpiderWebsite/demo/views.py b/SpiderWebsite/demo/views.py
index 5d53aa1..2e463d2 100644
--- a/SpiderWebsite/demo/views.py
+++ b/SpiderWebsite/demo/views.py
@@ -207,3 +207,80 @@ def word_cloud(request):
 # Write your spider functions starting here, e.g.:
 # def spider_fun(url, web_name):
 #     pass
+
+from bs4 import BeautifulSoup
+import urllib.request, urllib.error  # urllib is used here to do the job of the requests library
+import os
+import re
+import pandas as pd
+
+# regular expressions for extracting listing fields
+#findLink = re.compile(r'href="(.*?)"')  # extract the link URL
+findTitle = re.compile(r'target="_blank">(.*?)</a>')  # extract the title
+findPrice = re.compile(r'<span class="price">(.*?)</span>')  # extract the price (NOTE: enclosing tags assumed, not recoverable from the original; check the live markup)
+findTag = re.compile(r'/" target="_blank">(.*?)</a>')  # extract the item category
+findPlace = re.compile(r'<div class="ad-item-detail">(.*?)</div>')  # extract the location (NOTE: enclosing tags assumed, see above)
+
+def askURL(url):  # fetch a page and return its HTML
+    head = {
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
+    }
+    request = urllib.request.Request(url, headers=head)
+    html = ""
+    try:
+        response = urllib.request.urlopen(request)
+        html = response.read().decode('utf-8')
+        #print(html)  # test
+    except urllib.error.URLError as e:
+        if hasattr(e, 'code'):
+            print(e.code)
+        if hasattr(e, 'reason'):
+            print(e.reason)
+    return html
+
+def getData(baseurl):  # pull the fields we need out of each page
+    datalist = []
+    for i in range(1, 4):  # fetch each listing page; only the first few pages are crawled here
+        url = baseurl + str(i)
+        html = askURL(url)
+
+        soup = BeautifulSoup(html, "html.parser")
+        for item in soup.find_all('div', class_="media-body"):  # one entry per listing
+            #print(item)  # test
+            data = []
+
+            item = str(item)
+            title = re.findall(findTitle, item)[0]
+            place = re.findall(findPlace, item)[0]
+            price = re.findall(findPrice, item)[0]
+            tag = re.findall(findTag, item)[0]
+
+            data.append(title)
+            data.append(place)
+            data.append(price)
+            data.append(tag)
+            datalist.append(data)
+
+    return datalist
+
+def saveData(savepath, datalist, web_name):  # save the results as a CSV file
+    name = ["标题", "地址", "价格", "类型"]  # column headers: title, location, price, category
+    file = pd.DataFrame(columns=name, data=datalist)  # combine the headers and the rows
+    file.to_csv(os.path.join(savepath, 'lyh_tiaozaomarket.csv'), encoding='utf-8-sig')  # save to the current directory; utf-8-sig keeps the Chinese headers readable in Excel
+    print('已保存%s信息' % web_name)  # "saved the data for <web_name>"
+
+
+'''--------- where the code starts ---------'''
+
+def begin_spider(url, web_name):
+    savepath = os.getcwd()  # use the current working directory as the save path
+    datalist = getData(url)
+    saveData(savepath, datalist, web_name)
+
+def main():
+    url = 'https://guilin.baixing.com/ershou/?page='
+    web_name = '桂林百姓网二手市场'  # "Guilin Baixing second-hand market"
+    begin_spider(url, web_name)
+
+if __name__ == "__main__":
+    main()
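
The import comment above notes that urllib is standing in for the requests library. As a point of comparison, a minimal sketch of the same fetch written with requests might look like the following (assumptions: requests would be a new dependency for this project, and the name askURL_requests is hypothetical):

    import requests

    def askURL_requests(url):  # hypothetical drop-in alternative to askURL
        head = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
        }
        try:
            response = requests.get(url, headers=head, timeout=10)
            response.raise_for_status()  # surfaces 4xx/5xx errors, like the e.code branch above
            response.encoding = 'utf-8'  # match the explicit utf-8 decode in askURL
            return response.text
        except requests.RequestException as e:
            print(e)  # mirrors the URLError reporting in askURL
            return ""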
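
To sanity-check what saveData wrote, the CSV can be read back with pandas. This assumes the snippet runs from the same directory the spider saved to:

    import pandas as pd

    df = pd.read_csv('lyh_tiaozaomarket.csv', index_col=0, encoding='utf-8-sig')
    print(df.head())        # first few rows: 标题 / 地址 / 价格 / 类型
    print(len(df), 'rows')  # one row per listing across the crawled pages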
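
Since the hunk context shows the "def spider_fun(url, web_name)" stub in views.py, the crawler presumably gets exposed as a Django view at some point. A hedged sketch of that wiring, with the view name run_tiaozao_spider and its URLconf entry left as hypothetical:

    from django.http import HttpResponse

    def run_tiaozao_spider(request):  # hypothetical view; begin_spider lives in this same views.py
        url = 'https://guilin.baixing.com/ershou/?page='
        web_name = '桂林百姓网二手市场'
        begin_spider(url, web_name)
        return HttpResponse('已保存%s信息' % web_name)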