|
|
|
@ -0,0 +1,154 @@
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
import parsel
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def findNovelContent(url, name):
    """Scrape every chapter of a novel from www.1234u.net into '<name>.txt'.

    url  -- the novel's table-of-contents page URL
    name -- the novel title, used as the output file name (one .txt per novel)
    """
    # Browser-like User-Agent so the site does not reject the request as a bot.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'
    }
    response = requests.get(url=url, headers=headers).text  # table-of-contents HTML
    soup = BeautifulSoup(response, 'lxml')
    # Each chapter entry is an <li> under the <ul> inside class 'ml_list'.
    li_list = soup.select('.ml_list>ul>li')
    # 'with' guarantees the file is closed even if a request fails mid-loop
    # (the original opened the handle and never closed it).
    with open(name + '.txt', 'w', encoding='UTF-8') as fp:
        for li in li_list:
            title = li.a.string  # chapter title is the text of the <a> tag
            # href is site-relative; prepend the host to build the full chapter URL
            detail_url = 'https://www.1234u.net' + li.a['href']
            detail_page_text = requests.get(url=detail_url, headers=headers).text
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            # The chapter body lives in <p class="articlecontent">.
            div_tag = detail_soup.find('p', class_="articlecontent")
            content = div_tag.text
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功!!!')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def findNovelPrice(item):
    """Scrape title / price / cover-image info for *item* from dangdang.com
    search results (pages 1..98) and dump the records to '<item>.json'.

    item -- the book title to search for; also used as the JSON file name
    """
    ls = []  # accumulated record dicts across all result pages
    print("开始爬取{0}的信息".format(item))
    # Request headers are loop-invariant — build them once, not per page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'}
    for page in range(1, 99):
        print(f'========================正在保存第{page}页数据内容===================================')
        # item is the book title, page the 1-based results-page index
        url = f'https://search.dangdang.com/?key={item}&act=input&page_index={page}'
        response = requests.get(url=url, headers=headers)
        selector = parsel.Selector(response.text)
        lis = selector.css('.bigimg li')  # one <li> per book in the results grid
        for li in lis:
            title = li.css('.name a::attr(title)').get()        # book title
            price_n = li.css('.price .search_now_price::text').get()  # current price
            picture = li.css('.pic img::attr(data-original)').get()   # cover image URL
            ls.append({
                '标题': title,
                '售价': price_n,
                '图片地址': picture,
            })
    file_name = str(item) + '.json'
    # NOTE(review): ls[1:] drops the very first scraped record — presumably an
    # ad/sponsored slot at the top of the results; confirm this is intentional.
    # 'with' ensures the file is closed even if json serialization raises.
    with open(file_name, "w", encoding='utf-8') as fw:
        fw.write(json.dumps(ls[1:], sort_keys=False, indent=4, ensure_ascii=False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def findNovelComment(novelname, url):
    """Scrape the short reader comments from a Douban book page and write them,
    numbered, to '<novelname> 的豆瓣评论.txt'.

    novelname -- the book title, used in the output file name
    url       -- the Douban comments-page URL for that book
    """
    # Browser-like User-Agent so the site does not reject the request as a bot.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41"
    }
    res = requests.get(url=url, headers=headers)
    res = BeautifulSoup(res.text, "lxml")

    # Comments live inside <div id="comments">, one per <p> tag.
    comments = res.find("div", id="comments")
    p_list = comments.find_all("p")

    # Collect comment texts; each comment's text is in a <span> inside the <p>.
    data_list = []
    for p in p_list:
        span = p.find("span")
        # Guard: some <p> tags carry no <span>, or the span has nested markup
        # (string is None) — the original crashed on both cases.
        if span is not None and span.string is not None:
            data_list.append(span.string)

    novelname1 = f"{novelname} 的豆瓣评论"
    with open(novelname1 + ".txt", "w", encoding='utf-8') as f:
        # enumerate replaces the original's manual int<->str counter churn.
        for n, comment in enumerate(data_list, start=1):
            f.write("(" + str(n) + ")" + comment + "\n")  # one numbered comment per line
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Interactive entry point: pick which kind of data to scrape for one of
    # the Four Great Classical Novels (四大名著).
    select = input("请您选择进行下列爬取内容:(1)四大名著小说;(2)四大名著价格信息;(3)四大名著豆瓣评论:")
    select = int(select)

    if select == 1:
        print("#" * 20)
        # Full novel text
        book = input("请输入您想查询的四大名著:")
        # Table-of-contents URL on www.1234u.net for each novel.
        param = {'三国演义': 'https://www.1234u.net/xiaoshuo/121/121220/',
                 '水浒传': 'https://www.1234u.net/141/141647/',
                 '红楼梦': 'https://www.1234u.net/192/192648/',
                 # Fixed: original URL had a stray leading '2' ('2https://...'),
                 # which made this entry always fail.
                 '西游记': 'https://www.1234u.net/157/157874/'}
        # Direct membership test replaces the original counter-based loop.
        if book in param:
            findNovelContent(param[book], book)
        else:
            print("您输入的不是四大名著!")

    elif select == 2:
        print("#" * 20)
        book = input("请输入您想查询的四大名著:")
        # Price, title and cover-image info from dangdang.com
        book_list = ['三国演义', '红楼梦', '水浒传', '西游记']
        if book in book_list:
            findNovelPrice(book)
            # Only report completion on success (the original printed this
            # even when the input was invalid).
            print("价格爬取结束!")
        else:
            print("您输入的不是四大名著!")

    else:
        # Douban reader comments
        print("#" * 20)
        book = input("请输入您想查询的四大名著:")
        # Douban comments-page URL for each novel.
        param1 = {
            '西游记': "https://book.douban.com/subject/30137808/comments/?start=0&limit=20&status=P&sort"
                      "=new_score",
            '红楼梦': "https://book.douban.com/subject/1007305/comments/?start=0&limit=20&status=P&sort"
                      "=new_score",
            '水浒传': "https://book.douban.com/subject/30137810/comments/?start=0&limit=20&status=P&sort"
                      "=new_score",
            '三国演义': "https://book.douban.com/subject/26416768/comments/?start=0&limit=20&status=P&sort"
                       "=new_score"}
        if book in param1:
            findNovelComment(book, param1[book])
            print("评语爬取成功!!")
        else:
            print("您输入的不是四大名著!")
|
|
|
|
|
|
|
|
|
|
|