# Douban Top-250 book crawler: scrapes the chart pages, exports the data to
# CSV/Excel, and inserts cleaned rows into a MySQL database (via the local
# `sql` helper module).
import copy
|
|
import re
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
from time import sleep
|
|
import sql
|
|
|
|
|
|
class Book:
    """One entry scraped from the Douban Top-250 book chart."""

    def __init__(self, name, url, star, star_people, author, translater, publisher, pub_year, price, comment):
        # Identity and link.
        self.name = name
        self.url = url
        # Rating data (kept as the raw strings scraped from the page).
        self.star = star
        self.star_people = star_people
        # Publication details; author/translater may be None when the
        # chart's info line omits them.
        self.author = author
        self.translater = translater
        self.publisher = publisher
        self.pub_year = pub_year
        self.price = price
        # One-line quote shown under the entry, or None if absent.
        self.comment = comment

    def to_dict(self):
        """Return the record keyed by the Chinese column names used in the CSV/Excel export."""
        pairs = (
            ('书名', self.name),
            ('豆瓣链接', self.url),
            ('作者', self.author),
            ('译者', self.translater),
            ('出版社', self.publisher),
            ('出版日期', self.pub_year),
            ('价格', self.price),
            ('评分', self.star),
            ('评分人数', self.star_people),
            ('一句话评价', self.comment),
        )
        # dict() preserves insertion order, so column order matches the export.
        return dict(pairs)

    def __str__(self):
        return f"Book Info: {self.name} - {self.author} - {self.pub_year} - {self.publisher} - {self.price} - {self.star} - {self.star_people} - {self.comment}"
|
|
|
|
|
|
class DoubanBookTop250Crawler:
    """Scrape the Douban Top-250 book chart and collect `Book` records."""

    def __init__(self):
        # Rows as plain dicts (Chinese column names) — feeds the CSV export.
        self.book_list = []
        # The same rows as Book objects — used later for cleanup and DB insert.
        self.book_list_data = []

    def get_book_info(self, url, headers):
        """Fetch one listing page and append every book found on it.

        Appends to both self.book_list (dicts) and self.book_list_data
        (Book objects); entries whose info line cannot be parsed are skipped.
        """
        # timeout so a stalled connection cannot hang the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        for book in soup.select('.item'):
            name = book.select('.pl2 a')[0]['title']    # book title
            url = book.select('.pl2 a')[0]['href']      # detail-page link
            star = book.select('.rating_nums')[0].text  # average rating
            star_people = book.select('.pl')[1].text    # rating count (raw text)

            # The info line is "author / translator / publisher / year / price";
            # optional fields make the split length vary between 3 and 6.
            info = book.select('.pl')[0].text.split('/')
            if len(info) == 5:    # normal case
                author, translater, publisher, pub_year, price = info
            elif len(info) == 4:  # no translator
                author, publisher, pub_year, price = info
                translater = None
            elif len(info) == 6:  # two prices — keep both, '/'-joined
                author, translater, publisher, pub_year = info[:4]
                price = str(info[4]) + '/' + str(info[5])
            elif len(info) == 3:  # neither author nor translator
                author = None
                translater = None
                publisher = info[0]
                pub_year = info[1]
                price = str(info[2])
            else:
                # Unrecognized layout: report and skip this entry.
                print("Error: 图书信息爬取错误")
                continue

            # One-line quote; absent on some entries.
            quote = book.select('.quote span')
            book_comment = quote[0].text if quote else None

            book_obj = Book(name, url, star, star_people, author, translater, publisher, pub_year, price, book_comment)
            self.book_list.append(book_obj.to_dict())
            self.book_list_data.append(book_obj)

    def save_to_csv(self, csv_name):
        """Write collected rows to CSV (utf_8_sig = UTF-8 BOM so Excel shows Chinese correctly)."""
        df = pd.DataFrame(self.book_list)
        df.to_csv(csv_name, encoding='utf_8_sig', index=False)

    def crawl_douban_top250(self, page):
        """Crawl `page` listing pages (25 books each), then save everything to CSV."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        for i in range(page):
            page_url = 'https://book.douban.com/top250?start={}'.format(str(i * 25))
            print('开始爬取第{}页,地址是:{}'.format(str(i + 1), page_url))
            self.get_book_info(page_url, headers)
            sleep(1)  # throttle: be polite to the server
        self.save_to_csv(csv_name="BookDouban250.csv")
|
|
|
|
# Instantiate the crawler and run the full crawl / export / database pipeline.
|
|
def main(page):
    """Crawl `page` Douban Top-250 pages, export the books to Excel,
    and insert cleaned records into MySQL.

    Side effects: network requests, 'BookDouban250.csv' (via the crawler),
    'book_data.xlsx', and rows written to the `books` table.
    """
    crawler = DoubanBookTop250Crawler()
    crawler.crawl_douban_top250(page)
    # Two parallel views of the same scrape: dicts for Excel, Book objects for the DB.
    book_list = crawler.book_list
    book_list_data = crawler.book_list_data
    # Column-oriented accumulator for the Excel export (Chinese column names).
    dataexcel = {"书名": [], "豆瓣链接": [], "作者": [], "译者": [], "出版社": [], "出版日期": [], "价格": [],
                 "评分": [], "评分人数": [], "一句话评价": []}
    for book in book_list:
        # Clean the rating-count text in place: drop newlines, spaces and the
        # full-width parentheses around e.g. "(123456人评价)".
        book['评分人数'] = book['评分人数'].replace('\n', '').strip()
        book['评分人数'] = book['评分人数'].replace(' ', '').replace("(", "").replace(")", "").strip()
        dataexcel['书名'].append(book['书名'])
        dataexcel['豆瓣链接'].append(book['豆瓣链接'])
        dataexcel['作者'].append(book['作者'])
        dataexcel['译者'].append(book['译者'])
        dataexcel['出版社'].append(book['出版社'])
        dataexcel['出版日期'].append(book['出版日期'])
        dataexcel['价格'].append(book['价格'])
        dataexcel['评分'].append(book['评分'])
        dataexcel['评分人数'].append(book['评分人数'])
        dataexcel['一句话评价'].append(book['一句话评价'])
    # Books whose price field held two values become two DB rows; extras collect here.
    book_list_data_two_price = []
    for book in book_list_data:
        # Normalize rating count to digits only (e.g. "123456人评价" -> "123456").
        book.star_people = book.star_people.replace('\n', '').strip()
        book.star_people = book.star_people.replace(' ', '').replace("(", "").replace(")", "").strip()
        book.star_people = ''.join(filter(str.isdigit, book.star_people))
        # Strip the currency suffix '元' from the price.
        book.price = book.price.replace('元', '').strip()
        if book.price.find('/') != -1:
            # Two prices joined by '/': keep the first on this record and emit
            # a deep-copied duplicate record carrying the second price.
            copy_price = book.price.split('/')[1]
            book.price = book.price.split('/')[0]
            # Keep only a "NN.NN"-shaped amount; otherwise the raw text stands.
            match = re.search(r'\d+\.\d{2}', copy_price)
            if match:
                copy_price = match.group()
            copy_book = copy.deepcopy(book)
            copy_book.price = copy_price
            book_list_data_two_price.append(copy_book)
        match = re.search(r'\d+\.\d{2}', book.price)
        if match:
            book.price = match.group()
    book_list_data = book_list_data + book_list_data_two_price

    print(crawler.book_list)
    # Convert the accumulated columns to a DataFrame...
    df = pd.DataFrame(dataexcel)
    # ...and write it out as an Excel workbook.
    file_name = 'book_data.xlsx'
    df.to_excel(file_name, index=False)
    print(f"书籍数据已写入到 {file_name}")

    # Persist the cleaned Book objects via the project's BookDatabase helper.
    # NOTE(review): credentials are hard-coded here — consider moving to config.
    db = sql.BookDatabase(host='localhost', user='root', password='123456', database='xiaosuo', table_name='books')
    db.initialize_table()
    db.insert_books(book_list_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|