"""Scrape recipe listings from douguo.com and append them to a CSV file.

For every cuisine category the listing pages are downloaded one by one,
each recipe entry is reduced to a flat record, and the records are appended
to ``CSV_PATH``.
"""
import csv
import random
import time
import traceback
from time import sleep

import requests
from lxml import etree

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
}

# Output file; opened in append mode so repeated runs accumulate rows.
CSV_PATH = '../外国食谱.csv'

# Column order of the CSV rows (no header row is written).
FIELDNAMES = ['title', 'href', 'peiliao', 'rate', 'id', 'img', 'caipu']


def get_page(url, retries=3):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts before giving up.

    Returns:
        The decoded response text, or ``None`` when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            return response.text
        except Exception:
            # Deliberately broad: the crawl is best-effort and any failure
            # of a single request should only cost a retry, not the run.
            if attempt == retries:
                break
            print('请求失败,重新请求')
            # Short random pause so retries do not hammer the server.
            sleep(random.uniform(1, 2))
    print('请求{}次均失败,放弃此url请求,检查请求条件'.format(retries))
    return None


def _joined_text(node, path):
    """Return all XPath matches of *path* under *node*, joined and stripped."""
    return ''.join(node.xpath(path)).strip()


def _extract_row(node, caipu):
    """Build the CSV record for one ``<li class="clearfix">`` entry."""
    return {
        'title': _joined_text(node, './a/@title'),
        'href': 'https://www.douguo.com' + _joined_text(node, './div/a/@href'),
        'peiliao': _joined_text(node, './div/p/text()'),
        'rate': _joined_text(node, './div/div[1]/span[2]/text()'),
        'id': _joined_text(node, './div/div[2]/a[1]/text()'),
        'img': _joined_text(node, './a/img/@src'),
        'caipu': caipu,
    }


def parse_page(html, caipu, csv_path=CSV_PATH):
    """Extract every recipe entry from one listing page and append it to CSV.

    Args:
        html: Page source as returned by ``get_page``; ``None`` or an empty
            string (failed download) is skipped silently.
        caipu: Cuisine category stored in the ``caipu`` column of each row.
        csv_path: Destination CSV file, opened in append mode.

    Errors while parsing or writing are printed as a traceback and do not
    propagate, so one bad page does not abort the crawl.
    """
    if not html:
        # get_page has already reported the failure; nothing to parse.
        return
    try:
        tree = etree.HTML(html)
        if tree is None:
            return
        rows = [
            _extract_row(node, caipu)
            for node in tree.xpath('//li[@class="clearfix"]')
        ]
        if not rows:
            # Do not touch the file for a page without entries.
            return
        # utf_8_sig writes a BOM so spreadsheet tools detect the encoding;
        # newline='' lets the csv module control line endings.
        with open(csv_path, 'a', encoding='utf_8_sig', newline='') as fp:
            writer = csv.DictWriter(fp, FIELDNAMES)
            writer.writerows(rows)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print().
        traceback.print_exc()


def main(x, caipu=None):
    """Fetch and store listing page number *x* (0-based) of one cuisine.

    Args:
        x: Page index; the URL offset is ``x * 20``.
        caipu: Cuisine category. When omitted, the module-level ``caipu``
            variable is used, which keeps the old ``main(x)`` call working.
    """
    if caipu is None:
        # Backward compatibility with callers that set a global `caipu`.
        caipu = globals()['caipu']
    url = 'https://www.douguo.com/caipu/{}/0/{}'.format(caipu, x * 20)
    print(url)
    html = get_page(url)
    parse_page(html, caipu)


if __name__ == '__main__':
    # Cuisine categories to crawl.
    caipu_list = ['韩国', '日本料理', '法国', '意大利餐']
    start = time.time()  # wall-clock timer for the whole run
    for caipu in caipu_list:
        for i in range(40):  # crawl 40 listing pages per category
            main(x=i, caipu=caipu)
            # Random pause between pages to stay polite to the server.
            time.sleep(random.uniform(1, 2))
            print(caipu, "第" + str(i + 1) + "页提取完成")
    end = time.time()
    print('共用时', round((end - start) / 60, 2), '分钟')