book/11_DouBan250Spider.py

import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML
import pandas as pd  # write the CSV
from time import sleep  # pause between requests

book_name = []  # book title
book_url = []  # link to the book's page
book_star = []  # rating
book_star_people = []  # number of ratings
book_author = []  # author
book_translater = []  # translator
book_publisher = []  # publisher
book_pub_year = []  # publication date
book_price = []  # price
book_comment = []  # one-line quote
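
# These module-level lists accumulate results across pages: get_book_info()
# appends one entry per book, and save_to_csv() turns them into columns.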


def get_book_info(url, headers):
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    for book in soup.select('.item'):
        name = book.select('.pl2 a')[0]['title']  # book title
        book_name.append(name)
        bkurl = book.select('.pl2 a')[0]['href']  # book link
        book_url.append(bkurl)
        star = book.select('.rating_nums')[0].text  # rating
        book_star.append(star)
        star_people = book.select('.pl')[1].text  # number of ratings
        # clean the raw text: strip whitespace, parentheses and the "人评价" suffix
        star_people = (star_people.strip().replace(' ', '').replace('人评价', '')
                       .replace('(\n', '').replace('\n)', ''))
        book_star_people.append(star_people)
        # some books have no one-line quote, e.g. 君主论 (second from the bottom)
        if book.select('.quote span'):
            book_comment.append(book.select('.quote span')[0].text)
        else:
            book_comment.append(None)
        info = book.select('.pl')[0].text.split('/')
        if len(info) == 5:  # normal case
            book_author.append(info[0])
            book_translater.append(info[1])
            book_publisher.append(info[2])
            book_pub_year.append(info[3])
            book_price.append(str(info[4]))
        elif len(info) == 4:  # no translator, e.g. 红楼梦 (ranked first)
            book_author.append(info[0])
            book_translater.append(None)
            book_publisher.append(info[1])
            book_pub_year.append(info[2])
            book_price.append(str(info[3]))
        elif len(info) == 6:  # two prices, e.g. 福尔摩斯探案全集上中下 on page 1
            book_author.append(info[0])
            book_translater.append(info[1])
            book_publisher.append(info[2])
            book_pub_year.append(info[3])
            book_price.append(str(info[4]) + '/' + str(info[5]))
        elif len(info) == 3:  # no author and no translator, e.g. 十万个为什么 on page 5
            book_author.append(None)
            book_translater.append(None)
            book_publisher.append(info[0])
            book_pub_year.append(info[1])
            book_price.append(str(info[2]))
        else:
            pass
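
# Shape of the '/'-separated '.pl' info line the branches above handle
# (field names are generic placeholders; actual values vary per book):
#   author / translator / publisher / pub date / price            -> 5 fields
#   author / publisher / pub date / price                         -> 4 fields
#   author / translator / publisher / pub date / price1 / price2  -> 6 fields
#   publisher / pub date / price                                  -> 3 fields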


def save_to_csv(csv_name):
    """
    Save the scraped data to a CSV file.
    :return: None
    """
    df = pd.DataFrame()  # build the table column by column
    df['书名'] = book_name  # book title
    df['豆瓣链接'] = book_url  # Douban link
    df['作者'] = book_author  # author
    df['译者'] = book_translater  # translator
    df['出版社'] = book_publisher  # publisher
    df['出版日期'] = book_pub_year  # publication date
    df['价格'] = book_price  # price
    df['评分'] = book_star  # rating
    df['评分人数'] = book_star_people  # number of ratings
    df['一句话评价'] = book_comment  # one-line quote
    df.to_csv(csv_name, encoding='utf_8_sig')  # write the data out as CSV
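    # Note: utf_8_sig prepends a UTF-8 BOM so Excel detects the encoding
    # and renders the Chinese column headers correctly; plain 'utf-8'
    # tends to show mojibake when the file is opened in Excel.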


if __name__ == "__main__":
    # define a request header so Douban serves the page to a "browser"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    # start crawling the Douban data: 10 pages, 25 books per page
    for i in range(10):
        page_url = 'https://book.douban.com/top250?start={}'.format(str(i * 25))
        print('开始爬取第{}页,地址是:{}'.format(str(i + 1), page_url))
        get_book_info(page_url, headers)
        sleep(1)  # pause 1 second between pages to be polite to the server
    # save to a CSV file
    save_to_csv(csv_name="BookDouban250.csv")
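
# A sketch of a typical run (console messages come from the print call above):
#   $ python 11_DouBan250Spider.py
#   开始爬取第1页,地址是:https://book.douban.com/top250?start=0
#   ...
# The result can then be loaded back for analysis with, e.g.,
# pd.read_csv("BookDouban250.csv", index_col=0); index_col=0 absorbs the
# integer index that to_csv() above writes by default.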