@@ -207,3 +207,80 @@ def word_cloud(request):

# Write your spider functions starting here, e.g.:
# def spider_fun(url, web_name):
#     pass


from bs4 import BeautifulSoup
import urllib.request, urllib.error  # the standard-library urllib stands in for the requests library here
import os
import re
import pandas as pd

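# (With the third-party requests library, the fetch in askURL below could be
#  written as requests.get(url, headers=head).text; urllib is used here instead.)
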
# Regex patterns for pulling fields out of the listing HTML
#findLink = re.compile(r'href="(.*?)"')  # extract the link
findTitle = re.compile(r'target="_blank">(.*?)</a>')  # extract the title
findPrice = re.compile(r'<span class="highlight">(.*?)</span>')  # extract the price
findTag = re.compile(r'/" target="_blank">(.*?)</a></div>')  # extract the item category
findPlace = re.compile(r'<div class="ad-item-detail">(.*?)</div>')  # extract the location
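
# A minimal sketch of how these patterns match, on a hypothetical fragment of
# the listing HTML (the sample string is illustrative, not from the real page):
#   sample = '<a href="/ershou/123.html" target="_blank">九成新自行车</a>'
#   re.findall(findTitle, sample)  # -> ['九成新自行车']

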
def askURL(url):  # fetch a page and return its HTML
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        #print(html)  # test
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
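
# Usage sketch (page 1 of the listing index used in main() below):
#   html = askURL('https://guilin.baixing.com/ershou/?page=1')
# On failure askURL prints the error and returns '', so callers get an
# empty string rather than an exception.

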
def getData(baseurl):  # pull the fields we need from each page
    datalist = []
    for i in range(1, 4):  # call askURL for each page; only a few pages are visited here
        url = baseurl + str(i)
        html = askURL(url)

        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="media-body"):  # extract fields
            #print(item)  # test
            data = []

            item = str(item)
            title = re.findall(findTitle, item)
            place = re.findall(findPlace, item)
            price = re.findall(findPrice, item)
            tag = re.findall(findTag, item)
            if not (title and place and price and tag):
                continue  # skip listings missing a field rather than raising IndexError

            data.append(title[0])
            data.append(place[0])
            data.append(price[0])
            data.append(tag[0])
            datalist.append(data)

    return datalist
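
# getData returns one [title, place, price, tag] row per listing; an
# illustrative (not real) element would look like:
#   ['九成新自行车', '七星区', '300元', '自行车']

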
def saveData(savepath, datalist, web_name):  # save the results to a file
    name = ["标题", "地址", "价格", "类型"]
    file = pd.DataFrame(columns=name, data=datalist)  # combine header and rows
    file.to_csv(savepath + '/lyh_tiaozaomarket.csv')  # save to the current path as xxx.csv
    print('已保存%s信息' % web_name)
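
# Note: if the Chinese headers look garbled when the CSV is opened in Excel,
# to_csv also accepts encoding='utf-8-sig' (and index=False to drop the
# row-number column); both tweaks are optional, e.g.:
#   file.to_csv(savepath + '/lyh_tiaozaomarket.csv', index=False, encoding='utf-8-sig')

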
'''--------- where the code begins ---------'''


def begin_spider(url, web_name):
    savepath = os.getcwd()  # use the current working directory as the save path
    datalist = getData(url)
    saveData(savepath, datalist, web_name)


def main():
    url = 'https://guilin.baixing.com/ershou/?page='
    web_name = '桂林百姓网二手市场'
    begin_spider(url, web_name)


if __name__ == "__main__":
    main()