import json
import parsel
import requests
from bs4 import BeautifulSoup

def findNovelContent(url, name):
    # Pretend to be a browser so the site is less likely to block the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'
    }
    response = requests.get(url=url, headers=headers).text  # fetch the chapter index page
    soup = BeautifulSoup(response, 'lxml')  # parse it so the selectors below can run
    li_list = soup.select('.ml_list>ul>li')  # each li under the ul in class="ml_list" is one chapter
    with open(name + '.txt', 'w', encoding='UTF-8') as fp:  # one UTF-8 text file per novel
        for li in li_list:
            title = li.a.string  # the chapter title is the string of the a tag
            # The href is relative, so prepend the site root to build the full chapter URL.
            detail_url = 'https://www.1234u.net' + li.a['href']
            detail_page_text = requests.get(url=detail_url, headers=headers).text  # fetch the chapter page
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            # The chapter body sits in the p tag with class "articlecontent".
            content_tag = detail_soup.find('p', class_="articlecontent")
            content = content_tag.text  # keep only the text inside that tag
            fp.write(title + ':' + content + '\n')  # append the chapter to the file
            print(title, 'scraped successfully!!!')

def findNovelPrice(item):
    ls = []  # collects one dict per book listing
    print("Scraping listings for {0}".format(item))
    for page in range(1, 99):
        print(f'======================== saving data from page {page} ========================')
        # item is the book title, page_index the page number of the search results.
        url = f'https://search.dangdang.com/?key={item}&act=input&page_index={page}'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'}  # request headers
        response = requests.get(url=url, headers=headers)  # request the search results page
        selector = parsel.Selector(response.text)  # initialise the parsel selector
        lis = selector.css('.bigimg li')  # one li per book listing
        for li in lis:  # pull the fields we want out of each listing
            title = li.css('.name a::attr(title)').get()  # title / book name
            price_n = li.css('.price .search_now_price::text').get()  # current price
            picture = li.css('.pic img::attr(data-original)').get()  # image URL
            dit = {
                'title': title,
                'price': price_n,
                'image_url': picture,
            }  # one record per listing
            ls.append(dit)  # nest the record in the ls list
    file_name = str(item) + '.json'  # name the output file after the book
    with open(file_name, "w", encoding='utf-8') as fw:
        # Serialise to JSON; the first record is skipped, as in the original script.
        a = json.dumps(ls[1:], sort_keys=False, indent=4, ensure_ascii=False)
        fw.write(a)

def findNovelComment(novelname, url):
    # Pretend to be a browser so the site is less likely to block the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41"
    }
    # Fetch the comment page and parse it with the lxml backend.
    res = requests.get(url=url, headers=headers)
    res = BeautifulSoup(res.text, "lxml")
    comments = res.find("div", id="comments")
    p_list = comments.find_all("p")  # each comment lives in a p tag under div#comments
    # Collect the comment text, then write it to a txt file below.
    data_list = []
    for i in p_list:
        span = i.find("span")  # 1. the comment text sits in a span inside the p tag
        if span is not None and span.string is not None:  # skip p tags without a comment span
            data_list.append(span.string)
    novelname1 = f"{novelname} Douban comments"
    with open(novelname1 + ".txt", "w", encoding='utf-8') as f:
        for n, comment in enumerate(data_list, start=1):  # write each numbered comment on its own line
            f.write("(" + str(n) + ")" + comment + "\n")

if __name__ == '__main__':
    select = input("Choose what to scrape: 1 novel text, 2 price info, 3 Douban comments: ")
    select = int(select)
    if select == 1:
        print("#" * 20)
        # Chapter text of the novel.
        book = input("Enter the Four Great Classical Novel to look up (Chinese title): ")
        param = {'三国演义': 'https://www.1234u.net/xiaoshuo/121/121220/',
                 '水浒传': 'https://www.1234u.net/141/141647/',
                 '红楼梦': 'https://www.1234u.net/192/192648/',
                 '西游记': 'https://www.1234u.net/157/157874/'}
        if book in param:
            findNovelContent(param[book], book)
        else:
            print("That is not one of the Four Great Classical Novels!")
    elif select == 2:
        print("#" * 20)
        book = input("Enter the Four Great Classical Novel to look up (Chinese title): ")
        # Price, title, and image URL of each listing.
        book_list = ['三国演义', '红楼梦', '水浒传', '西游记']
        if book in book_list:
            findNovelPrice(book)
            print("Price scraping finished!")
        else:
            print("That is not one of the Four Great Classical Novels!")
    else:
        # Douban comments on the novel.
        print("#" * 20)
        book = input("Enter the Four Great Classical Novel to look up (Chinese title): ")
        param1 = {
            '西游记': "https://book.douban.com/subject/30137808/comments/?start=0&limit=20&status=P&sort"
                   "=new_score",
            '红楼梦': "https://book.douban.com/subject/1007305/comments/?start=0&limit=20&status=P&sort"
                   "=new_score",
            '水浒传': "https://book.douban.com/subject/30137810/comments/?start=0&limit=20&status=P&sort"
                   "=new_score",
            '三国演义': "https://book.douban.com/subject/26416768/comments/?start=0&limit=20&status=P&sort"
                    "=new_score"}
        if book in param1:
            findNovelComment(book, param1[book])
            print("Comments scraped successfully!!")
        else:
            print("That is not one of the Four Great Classical Novels!")
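# Example session (hypothetical inputs): run the script, type 1 at the first
# prompt and 三国演义 at the second; every chapter is then written to 三国演义.txt.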