You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

85 lines
2.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import csv
import time
import random
import requests
import traceback
from time import sleep
from lxml import etree
# Fetch the HTML source of a page
def get_page(url, retries=3, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The request is sent with a fixed desktop-browser User-Agent header. Any
    exception raised while requesting or decoding counts as a failed attempt,
    and the request is retried immediately (no delay between attempts).

    Args:
        url: Address to request.
        retries: Total number of attempts before giving up. Values below 1
            are treated as 1. Defaults to 3.
        timeout: Timeout in seconds passed to ``requests.get``. Defaults
            to 10.

    Returns:
        The response text, or ``None`` when every attempt failed.
    """
    # Built once; the headers are identical for every attempt.
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
    }
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            return response.text
        except Exception:
            # Deliberately broad: this is a best-effort fetch, and a failed
            # page must not abort the whole crawl.
            if attempt == attempts:
                print('请求{}次均失败放弃此url请求,检查请求条件'.format(attempts))
                return None
            print('请求失败,重新请求')
# Scrape one page of data
def parse_page(html, caipu):
    """Parse one listing page and append its entries to the output CSV.

    Every ``<li class="clearfix">`` element in ``html`` becomes one row in
    ``../外国食谱.csv`` (opened in append mode, no header row is written).
    Errors are reported with a traceback on stderr and never propagate.

    Args:
        html: Page source as text. ``None`` or an empty string (for example
            a failed download) is skipped without output.
        caipu: Category label stored in the ``caipu`` column of every row.

    Returns:
        None.
    """
    if not html:
        # Nothing was downloaded, so there is nothing to parse.
        return

    fieldnames = ['title', 'href', 'peiliao', 'rate', 'id', 'img', 'caipu']
    try:
        tree = etree.HTML(html)  # parse the page
        nodes = tree.xpath('//li[@class="clearfix"]')
        if not nodes:
            # Keep the old behaviour of not touching the file for empty pages.
            return

        # 'a' is append mode; utf_8_sig keeps the exported CSV from showing
        # garbled characters. The file is opened once per page, not per row.
        with open('../外国食谱.csv', 'a', encoding='utf_8_sig', newline='') as fp:
            writer = csv.DictWriter(fp, fieldnames)
            for node in nodes:
                # Each xpath may match zero or more strings; join them so a
                # missing element yields '' instead of raising.
                row = {
                    'title': ''.join(node.xpath('./a/@title')).strip(),
                    'href': 'https://www.douguo.com' + ''.join(node.xpath('./div/a/@href')).strip(),
                    # presumably the ingredient summary - confirm against the site
                    'peiliao': ''.join(node.xpath('./div/p/text()')).strip(),
                    'rate': ''.join(node.xpath('./div/div[1]/span[2]/text()')).strip(),
                    # text of the first link in the second inner div
                    'id': ''.join(node.xpath('./div/div[2]/a[1]/text()')).strip(),
                    'img': ''.join(node.xpath('./a/img/@src')).strip(),
                    'caipu': caipu,
                }
                try:
                    writer.writerow(row)
                except Exception:
                    # One bad row must not stop the rest of the page.
                    # print_exc() already writes the traceback and returns
                    # None, so it must not be wrapped in print().
                    traceback.print_exc()
    except Exception:
        traceback.print_exc()
# Main function
def main(x, caipu=None):
    """Download and store one listing page of a category.

    Builds the listing URL, prints it, fetches it with ``get_page`` and
    passes the result to ``parse_page``.

    Args:
        x: Zero-based page index. The last URL path segment is ``x * 20``
            (presumably the item offset at 20 entries per page - confirm
            against the site).
        caipu: Category name inserted into the URL and stored with each row.
            When omitted, the module-level ``caipu`` variable is used, which
            keeps existing ``main(x=...)`` callers working.

    Raises:
        KeyError: If ``caipu`` is omitted and no module-level ``caipu``
            is defined.
    """
    if caipu is None:
        # Backward compatibility: the script's loop binds a global ``caipu``
        # before calling main(x=...). The parameter shadows that global, so
        # it has to be read through globals().
        caipu = globals()['caipu']
    url = 'https://www.douguo.com/caipu/{}/0/{}'.format(caipu, x * 20)
    print(url)
    html = get_page(url)
    parse_page(html, caipu)
if __name__ == '__main__':
    # Categories to crawl; each name is inserted into the listing URL.
    caipu_list = ['韩国', '日本料理', '法国', '意大利餐']
    started_at = time.time()  # start of the elapsed-time measurement
    # NOTE: the loop variable must stay named ``caipu`` because main() reads
    # that module-level name when it builds the URL.
    for caipu in caipu_list:
        # Crawl 40 pages per category.
        for page_number in range(1, 41):
            main(x=page_number - 1)
            # Random 1-2 second pause between pages.
            sleep(random.uniform(1, 2))
            progress = "" + str(page_number) + "页提取完成"
            print(caipu, progress)
    elapsed_minutes = round((time.time() - started_at) / 60, 2)
    print('共用时', elapsed_minutes, '分钟')