"""Scrape the Douban Books Top 250 listing into a CSV file.

The script requests the listing pages at ``start`` offsets 0, 25, ..., 225,
selects every ``<tr class="item">`` row with XPath, and writes one CSV row
per book to ``OUTPUT_PATH``.
"""
# Extracted from git patch 2912b00 (2021-05-27), which adds this script as a
# new file.

import csv

import requests
from lxml import etree

OUTPUT_PATH = 'C://Users/123/Desktop/doubanbook.csv'

# Column order of the CSV; every data row must supply exactly these fields.
FIELDNAMES = ('name', 'url', 'author', 'publisher', 'date', 'price', 'rate', 'comment')

PAGE_URLS = ['https://book.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]

# Browser-style User-Agent sent with every request
# (presumably the site rejects the default client string -- TODO confirm).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# requests applies no timeout unless one is given, so a stalled connection
# would otherwise hang the script indefinitely.
REQUEST_TIMEOUT = 10

# Placeholder written when a book row carries no one-line comment.
NO_COMMENT = "空"


def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _split_book_info(book_info):
    """Split a '/'-separated info line into (author, publisher, date, price).

    The author is the first field; publisher, date and price are read from
    the end of the line (third-, second- and last field), so extra fields in
    the middle do not shift them. Fields that are not present come back as
    empty strings instead of raising IndexError.
    """
    parts = book_info.split('/')
    author = parts[0]
    price = parts[-1]
    date = parts[-2] if len(parts) >= 2 else ''
    # With only three fields the third-from-last one is the field already
    # reported as the author, so a publisher is taken only from four or more.
    publisher = parts[-3] if len(parts) >= 4 else ''
    return author, publisher, date, price


def main():
    """Fetch every listing page and write the extracted rows to OUTPUT_PATH.

    Raises:
        requests.RequestException: if a page cannot be fetched, times out,
            or is answered with an HTTP error status.
        OSError: if OUTPUT_PATH cannot be opened for writing.
    """
    # The context manager closes the file even when a request or parse fails.
    with open(OUTPUT_PATH, 'wt', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        writer.writerow(FIELDNAMES)

        for page_url in PAGE_URLS:
            response = requests.get(page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            # Fail loudly rather than parsing an error page into zero rows.
            response.raise_for_status()
            selector = etree.HTML(response.text)

            for info in selector.xpath('//tr[@class="item"]'):
                name = _first(info.xpath('td/div/a/@title'))
                # Named book_url so it does not rebind the page-loop variable.
                book_url = _first(info.xpath('td/div/a/@href'))
                author, publisher, date, price = _split_book_info(
                    _first(info.xpath('td/p/text()'))
                )
                rate = _first(info.xpath('td/div/span[2]/text()'))
                comment = _first(info.xpath('td/p/span/text()'), NO_COMMENT)
                # Eight values, matching FIELDNAMES one-to-one.
                writer.writerow((name, book_url, author, publisher, date, price, rate, comment))


if __name__ == '__main__':
    main()