master
pztfkhlix 4 years ago
parent e054f0d58f
commit 012a66d2a8

@ -207,3 +207,80 @@ def word_cloud(request):
# Write your spider functions below, for example:
# def spider_fun(url, web_name):
#     pass
from bs4 import BeautifulSoup
import urllib.request,urllib.error #这里用urllib库实现requests库功能
import os
import re
import pandas as pd
# Regular expressions that extract individual fields from the HTML of one listing.
# findLink = re.compile(r'href="(.*?)"')  # extract the link URL (currently disabled)
findTitle = re.compile(r'target="_blank">(.*?)</a>')  # extract the title
findPrice = re.compile(r'<span class="highlight">(.*?)</span>')  # extract the price
findTag = re.compile(r'/" target="_blank">(.*?)</a></div>')  # extract the item category
findPlace = re.compile(r'<div class="ad-item-detail">(.*?)</div>')  # extract the address
def askURL(url, timeout=10):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    A desktop-browser ``user-agent`` header is sent with the request.

    Args:
        url: Address to request.
        timeout: Seconds to wait on a blocking network operation before
            giving up, so a stalled server cannot hang the scraper.

    Returns:
        The decoded page text, or an empty string when the request fails.
        Failures are reported with ``print`` (error code and/or reason)
        instead of being raised.
    """
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the connection even if reading or
        # decoding the body fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    except OSError as e:
        # A timeout or connection drop while reading the body surfaces as a
        # plain OSError subclass rather than URLError.
        print(e)
    return html
def _first_match(pattern, text, default=""):
    """Return the first captured group of ``pattern`` in ``text``, or ``default``."""
    found = re.search(pattern, text)
    return found.group(1) if found else default


def getData(baseurl, pages=3):
    """Collect listing rows from consecutive result pages.

    Page numbers ``1..pages`` are appended to ``baseurl``; each page is
    fetched with ``askURL`` and every ``<div class="media-body">`` element is
    scanned with the module-level regular expressions.

    Args:
        baseurl: URL prefix to which the page number is appended.
        pages: Number of pages to visit, starting at page 1.

    Returns:
        A list of ``[title, place, price, tag]`` rows. A field that cannot
        be found in a listing is stored as an empty string, so a single
        incomplete listing no longer aborts the whole scrape.
    """
    datalist = []
    for page in range(1, pages + 1):
        html = askURL(baseurl + str(page))
        if not html:
            # askURL returns "" on failure; nothing to parse on this page.
            continue
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="media-body"):
            text = str(item)
            row = [
                _first_match(findTitle, text),
                _first_match(findPlace, text),
                _first_match(findPrice, text),
                _first_match(findTag, text),
            ]
            # Skip elements in which none of the patterns matched.
            if any(row):
                datalist.append(row)
    return datalist
def saveData(savepath, datalist, web_name):
    """Write the scraped rows to ``lyh_tiaozaomarket.csv`` inside ``savepath``.

    Args:
        savepath: Directory path (string) that will receive the CSV file.
        datalist: Rows of four values each, in the order title, address,
            price, category.
        web_name: Site name used only in the confirmation message.
    """
    header = ["标题", "地址", "价格", "类型"]
    target = savepath + '/lyh_tiaozaomarket.csv'
    # Combine the header and the rows into one table before writing it out.
    table = pd.DataFrame(datalist, columns=header)
    table.to_csv(target)
    print('已保存%s信息' % web_name)
# Section marker kept from the original file ("where the code starts").
'''---------代码开始的地方---------'''


def begin_spider(url, web_name):
    """Scrape ``url`` and save the result in the current working directory.

    Args:
        url: Base URL handed to ``getData``.
        web_name: Site name forwarded to ``saveData`` for its message.
    """
    # Arguments are evaluated left to right: the save directory is resolved
    # first, then the pages are scraped.
    saveData(os.getcwd(), getData(url), web_name)
def main():
    """Run the spider against the Guilin Baixing second-hand listings."""
    begin_spider(
        url='https://guilin.baixing.com/ershou/?page=',
        web_name='桂林百姓网二手市场',
    )


# Only start scraping when the file is executed as a script.
if __name__ == "__main__":
    main()

Loading…
Cancel
Save