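"""Crawler for the Douban Book Top 250 list (https://book.douban.com/top250).

Scrapes each book's title, link, author, translator, publisher, publication
date, price, rating, rater count, and one-line quote, then exports the
results to CSV, Excel, and MySQL (via the local `sql` helper module).
"""
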
import copy
import re
from time import sleep

import requests
from bs4 import BeautifulSoup
import pandas as pd

import sql  # local helper module that provides BookDatabase


class Book:
    def __init__(self, name, url, star, star_people, author, translater, publisher, pub_year, price, comment):
        self.name = name
        self.url = url
        self.star = star
        self.star_people = star_people
        self.author = author
        self.translater = translater
        self.publisher = publisher
        self.pub_year = pub_year
        self.price = price
        self.comment = comment

    def to_dict(self):
        return {
            '书名': self.name,
            '豆瓣链接': self.url,
            '作者': self.author,
            '译者': self.translater,
            '出版社': self.publisher,
            '出版日期': self.pub_year,
            '价格': self.price,
            '评分': self.star,
            '评分人数': self.star_people,
            '一句话评价': self.comment,
        }

    def __str__(self):
        return (f"Book Info: {self.name} - {self.author} - {self.pub_year} - "
                f"{self.publisher} - {self.price} - {self.star} - "
                f"{self.star_people} - {self.comment}")


class DoubanBookTop250Crawler:
    def __init__(self):
        self.book_list = []       # per-book dicts, used for the CSV/Excel exports
        self.book_list_data = []  # Book objects, used for the database insert

    def get_book_info(self, url, headers):
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        for book in soup.select('.item'):
            name = book.select('.pl2 a')[0]['title']     # book title
            book_url = book.select('.pl2 a')[0]['href']  # Douban link
            star = book.select('.rating_nums')[0].text   # rating
            star_people = book.select('.pl')[1].text     # number of raters
            # The remaining fields arrive as one slash-separated string,
            # normally "author / translator / publisher / pub date / price".
            info = book.select('.pl')[0].text.split('/')
            if len(info) == 5:    # normal case
                author = info[0]
                translater = info[1]
                publisher = info[2]
                pub_year = info[3]
                price = info[4]
            elif len(info) == 4:  # no translator
                author = info[0]
                translater = None
                publisher = info[1]
                pub_year = info[2]
                price = info[3]
            elif len(info) == 6:  # two prices
                author = info[0]
                translater = info[1]
                publisher = info[2]
                pub_year = info[3]
                price = str(info[4]) + '/' + str(info[5])
            elif len(info) == 3:  # neither author nor translator
                author = None
                translater = None
                publisher = info[0]
                pub_year = info[1]
                price = str(info[2])
            else:
                # Unexpected layout: report and skip this entry
                print("Error: failed to parse book info")
                continue
            # One-line quote; some books have none
            comment = book.select('.quote span')[0].text if book.select('.quote span') else None
            book_obj = Book(name, book_url, star, star_people, author, translater,
                            publisher, pub_year, price, comment)
            self.book_list.append(book_obj.to_dict())
            self.book_list_data.append(book_obj)

    def save_to_csv(self, csv_name):
        df = pd.DataFrame(self.book_list)
        df.to_csv(csv_name, encoding='utf_8_sig', index=False)

    def crawl_douban_top250(self, page):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        for i in range(page):
            page_url = 'https://book.douban.com/top250?start={}'.format(i * 25)
            print('Crawling page {}, URL: {}'.format(i + 1, page_url))
            self.get_book_info(page_url, headers)
            sleep(1)  # pause between pages to be polite to the server
        self.save_to_csv(csv_name="BookDouban250.csv")


def main(page):
    """Instantiate the crawler, run the crawl, and export the data."""
    crawler = DoubanBookTop250Crawler()
    crawler.crawl_douban_top250(page)
    book_list = crawler.book_list
    book_list_data = crawler.book_list_data
    dataexcel = {"书名": [], "豆瓣链接": [], "作者": [], "译者": [], "出版社": [], "出版日期": [], "价格": [],
                 "评分": [], "评分人数": [], "一句话评价": []}
    for book in book_list:
        # Strip newlines, spaces, and parentheses from the rater count
        book['评分人数'] = book['评分人数'].replace('\n', '').strip()
        book['评分人数'] = book['评分人数'].replace(' ', '').replace("(", "").replace(")", "").strip()
        for key in dataexcel:
            dataexcel[key].append(book[key])
    # A book listing two prices becomes two records, one per price
    book_list_data_two_price = []
    for book in book_list_data:
        book.star_people = book.star_people.replace('\n', '').strip()
        book.star_people = book.star_people.replace(' ', '').replace("(", "").replace(")", "").strip()
        book.star_people = ''.join(filter(str.isdigit, book.star_people))
        book.price = book.price.strip()
        if book.price.find('/') != -1:
            copy_price = book.price.split('/')[1]
            book.price = book.price.split('/')[0]
            match = re.search(r'\d+\.\d{2}', copy_price)
            if match:
                copy_price = match.group()
            copy_book = copy.deepcopy(book)
            copy_book.price = copy_price
            book_list_data_two_price.append(copy_book)
        match = re.search(r'\d+\.\d{2}', book.price)
        if match:
            book.price = match.group()
    book_list_data = book_list_data + book_list_data_two_price
    print(crawler.book_list)
    # Convert the cleaned records to a DataFrame and write them to Excel
    df = pd.DataFrame(dataexcel)
    file_name = 'book_data.xlsx'
    df.to_excel(file_name, index=False)
    print(f"Book data written to {file_name}")
    # Persist everything through the BookDatabase helper
    db = sql.BookDatabase(host='localhost', user='root', password='123456', database='xiaosuo', table_name='books')
    db.initialize_table()
    db.insert_books(book_list_data)
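

# Entry point: a minimal usage sketch. Crawling 10 pages (10 * 25 = 250
# books) is an assumption chosen to cover the full Top 250 list; pass a
# smaller number for a quick test. This also assumes a reachable MySQL
# server with the credentials hard-coded in main(), and that the local
# `sql` module defines BookDatabase with initialize_table() and
# insert_books() as used above.
if __name__ == '__main__':
    main(10)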