|
|
import csv
|
|
|
import time
|
|
|
import random
|
|
|
import requests
|
|
|
import traceback
|
|
|
from time import sleep
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
|
# Fetch the HTML source of a listing page.
def get_page(url):
    """Request *url* and return its HTML text, or None after 3 failed tries.

    A desktop Chrome User-Agent header is sent so the site serves the
    normal page. Any exception raised by the request (timeout, connection
    error, ...) counts as one failed attempt.

    url -- absolute URL to fetch
    Returns the response body as text, or None when all 3 attempts fail.
    """
    # Built once, outside the retry loop (the original rebuilt it per try).
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
    }
    n = 3  # remaining attempts
    while True:
        try:
            # sleep(random.uniform(1, 2))  # optional random 1-2 s delay
            response = requests.get(url, headers=headers, timeout=10)
            # print(response.text)
            return response.text
        except Exception:
            # Broad on purpose: any failure triggers a retry. The original
            # (TimeoutError, Exception) tuple was redundant — Exception
            # already covers TimeoutError.
            n -= 1
            if n == 0:
                print('请求3次均失败,放弃此url请求,检查请求条件')
                return None
            print('请求失败,重新请求')
|
|
|
|
|
|
|
|
|
# Scrape one listing page and append every recipe to the CSV file.
def parse_page(html, caipu):
    """Parse one douguo listing page and append each recipe row to the CSV.

    html  -- raw HTML of the listing page (as returned by get_page)
    caipu -- cuisine name, stored alongside each scraped row

    Never raises: parsing or file errors are printed via traceback and
    swallowed, so one bad page does not stop the whole crawl.
    """
    try:
        parse = etree.HTML(html)  # parse the page
        items = parse.xpath('//li[@class="clearfix"]')
        # Column order for the CSV; built once, not per item.
        fieldnames = ['title', 'href', 'peiliao', 'rate', 'id', 'img', 'caipu']
        for node in items:
            # NOTE: the original rebound the loop variable `item` to this
            # dict and shadowed the builtin `id`; both renamed here.
            row = {
                'title': ''.join(node.xpath('./a/@title')).strip(),
                'href': 'https://www.douguo.com' + ''.join(node.xpath('./div/a/@href')).strip(),
                'peiliao': ''.join(node.xpath('./div/p/text()')).strip(),
                'rate': ''.join(node.xpath('./div/div[1]/span[2]/text()')).strip(),
                'id': ''.join(node.xpath('./div/div[2]/a[1]/text()')).strip(),
                'img': ''.join(node.xpath('./a/img/@src')).strip(),
                'caipu': caipu,
            }
            # print(row)
            try:
                # 'a' appends; utf_8_sig keeps the exported CSV readable
                # (BOM prevents mojibake in Excel).
                with open('../外国食谱.csv', 'a', encoding='utf_8_sig', newline='') as fp:
                    writer = csv.DictWriter(fp, fieldnames)
                    writer.writerow(row)
            except Exception:
                # Fixed: the original did print(traceback.print_exc()),
                # which also printed the function's None return value.
                traceback.print_exc()
    except Exception:
        traceback.print_exc()
|
|
|
|
|
|
|
|
|
# Build the listing URL for page x of the current cuisine and scrape it.
def main(x):
    """Scrape page *x* (0-based) of the current cuisine.

    Relies on the module-level ``caipu`` set by the __main__ loop; each
    listing page shows 20 recipes, hence the x*20 offset in the URL.
    """
    url = 'https://www.douguo.com/caipu/{}/0/{}'.format(caipu, x * 20)
    print(url)
    html = get_page(url)
    if html is None:
        # get_page gave up after 3 tries; feeding None to parse_page would
        # only produce a spurious etree.HTML(None) traceback. Skip the page.
        return
    parse_page(html, caipu)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Foreign cuisines to crawl, 40 listing pages each.
    caipu_list = ['韩国', '日本料理', '法国', '意大利餐']
    start = time.time()  # wall-clock timer for the whole crawl
    for caipu in caipu_list:
        for page in range(40):
            # Scrape one listing page of the current cuisine.
            main(x=page)
            # Random 1-2 s pause between requests to stay polite.
            time.sleep(random.uniform(1, 2))
            print(caipu, "第" + str(page + 1) + "页提取完成")
    elapsed = time.time() - start
    print('共用时', round(elapsed / 60, 2), '分钟')