import copy
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

import sql  # local module providing the BookDatabase helper


class Book:
    def __init__(self, name, url, star, star_people, author, translater,
                 publisher, pub_year, price, comment):
        self.name = name
        self.url = url
        self.star = star
        self.star_people = star_people
        self.author = author
        self.translater = translater
        self.publisher = publisher
        self.pub_year = pub_year
        self.price = price
        self.comment = comment

    def to_dict(self):
        return {
            '书名': self.name,
            '豆瓣链接': self.url,
            '作者': self.author,
            '译者': self.translater,
            '出版社': self.publisher,
            '出版日期': self.pub_year,
            '价格': self.price,
            '评分': self.star,
            '评分人数': self.star_people,
            '一句话评价': self.comment,
        }

    def __str__(self):
        return (f"Book Info: {self.name} - {self.author} - {self.pub_year} - "
                f"{self.publisher} - {self.price} - {self.star} - "
                f"{self.star_people} - {self.comment}")


class DoubanBookTop250Crawler:
    def __init__(self):
        self.book_list = []
        self.book_list_data = []

    def get_book_info(self, url, headers):
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        for book in soup.select('.item'):
            name = book.select('.pl2 a')[0]['title']    # book title
            url = book.select('.pl2 a')[0]['href']      # Douban link
            star = book.select('.rating_nums')[0].text  # rating
            star_people = book.select('.pl')[1].text    # number of raters
            # The remaining fields share one "author / translator / publisher /
            # date / price" string; split it and branch on the field count.
            info = book.select('.pl')[0].text.split('/')
            if len(info) == 5:    # normal case
                author, translater, publisher, pub_year, price = info
            elif len(info) == 4:  # no translator
                author = info[0]
                translater = None
                publisher, pub_year, price = info[1:]
            elif len(info) == 6:  # two prices
                author, translater, publisher, pub_year = info[:4]
                price = info[4] + '/' + info[5]
            elif len(info) == 3:  # neither author nor translator
                author = None
                translater = None
                publisher, pub_year, price = info
            else:
                # Unexpected layout: report it and skip this entry.
                print(f"Error: could not parse book info: {info}")
                continue
            # one-line review (may be absent)
            comment = book.select('.quote span')[0].text if book.select('.quote span') else None
            book_obj = Book(name, url, star, star_people, author, translater,
                            publisher, pub_year, price, comment)
            self.book_list.append(book_obj.to_dict())
            self.book_list_data.append(book_obj)

    def save_to_csv(self, csv_name):
        df = pd.DataFrame(self.book_list)
        df.to_csv(csv_name, encoding='utf_8_sig', index=False)

    def crawl_douban_top250(self, page):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        for i in range(page):
            page_url = 'https://book.douban.com/top250?start={}'.format(i * 25)
            print('Crawling page {} at {}'.format(i + 1, page_url))
            self.get_book_info(page_url, headers)
            sleep(1)  # be polite: throttle requests
        self.save_to_csv(csv_name="BookDouban250.csv")


# Instantiate the crawler, run the crawl, and save the data.
def main(page):
    crawler = DoubanBookTop250Crawler()
    crawler.crawl_douban_top250(page)
    book_list = crawler.book_list
    book_list_data = crawler.book_list_data
    dataexcel = {"书名": [], "豆瓣链接": [], "作者": [], "译者": [], "出版社": [],
                 "出版日期": [], "价格": [], "评分": [], "评分人数": [], "一句话评价": []}
    for book in book_list:
        # Normalize the rater count: drop newlines, spaces, and parentheses.
        book['评分人数'] = book['评分人数'].replace('\n', '').strip()
        book['评分人数'] = book['评分人数'].replace(' ', '').replace("(", "").replace(")", "").strip()
        dataexcel['书名'].append(book['书名'])
        dataexcel['豆瓣链接'].append(book['豆瓣链接'])
        dataexcel['作者'].append(book['作者'])
        dataexcel['译者'].append(book['译者'])
        dataexcel['出版社'].append(book['出版社'])
        dataexcel['出版日期'].append(book['出版日期'])
        dataexcel['价格'].append(book['价格'])
        dataexcel['评分'].append(book['评分'])
        dataexcel['评分人数'].append(book['评分人数'])
        dataexcel['一句话评价'].append(book['一句话评价'])

    book_list_data_two_price = []
    for book in book_list_data:
        # Reduce the rater count to its digits, e.g. "(123456人评价)" -> "123456".
        book.star_people = book.star_people.replace('\n', '').strip()
        book.star_people = book.star_people.replace(' ', '').replace("(", "").replace(")", "").strip()
        book.star_people = ''.join(filter(str.isdigit, book.star_people))
        # Strip the currency unit; an "x/y" price means two editions, so clone
        # the record and give each copy one of the two prices.
        book.price = book.price.replace('元', '').strip()
        if book.price.find('/') != -1:
            copy_price = book.price.split('/')[1]
            book.price = book.price.split('/')[0]
            match = re.search(r'\d+\.\d{2}', copy_price)
            if match:
                copy_price = match.group()
            copy_book = copy.deepcopy(book)
            copy_book.price = copy_price
            book_list_data_two_price.append(copy_book)
        match = re.search(r'\d+\.\d{2}', book.price)
        if match:
            book.price = match.group()
    book_list_data = book_list_data + book_list_data_two_price
    print(crawler.book_list)

    # Convert the collected rows to a DataFrame and write them to Excel.
    df = pd.DataFrame(dataexcel)
    file_name = 'book_data.xlsx'
    df.to_excel(file_name, index=False)
    print(f"Book data written to {file_name}")

    # Persist the cleaned Book objects through the BookDatabase helper.
    db = sql.BookDatabase(host='localhost', user='root', password='123456',
                          database='xiaosuo', table_name='books')
    db.initialize_table()
    db.insert_books(book_list_data)
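
# Assumed entry point: main() is otherwise never invoked; 10 pages at
# 25 books per page covers the full Top 250.
if __name__ == '__main__':
    main(10)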
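
# --- sql.py (sketch) --------------------------------------------------------
# The local `sql` module imported above is not included in this script. Below
# is a minimal sketch of a compatible BookDatabase, assuming pymysql; the
# table schema and SQL are illustrative guesses, not the author's actual
# module. Save it as sql.py next to the crawler.
import pymysql


class BookDatabase:
    def __init__(self, host, user, password, database, table_name):
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset='utf8mb4')
        self.table_name = table_name

    def initialize_table(self):
        # Create the table if it is missing; columns mirror the Book fields.
        with self.conn.cursor() as cur:
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS {self.table_name} (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    name VARCHAR(255),
                    url VARCHAR(255),
                    star VARCHAR(16),
                    star_people VARCHAR(32),
                    author VARCHAR(255),
                    translater VARCHAR(255),
                    publisher VARCHAR(255),
                    pub_year VARCHAR(64),
                    price VARCHAR(32),
                    comment TEXT
                ) CHARACTER SET utf8mb4""")
        self.conn.commit()

    def insert_books(self, books):
        # `books` is the list of Book objects built by the crawler.
        stmt = (f"INSERT INTO {self.table_name} "
                "(name, url, star, star_people, author, translater, "
                "publisher, pub_year, price, comment) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        rows = [(b.name, b.url, b.star, b.star_people, b.author, b.translater,
                 b.publisher, b.pub_year, b.price, b.comment) for b in books]
        with self.conn.cursor() as cur:
            cur.executemany(stmt, rows)
        self.conn.commit()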