
import json

import parsel
import requests
from bs4 import BeautifulSoup

def findNovelContent(url, name):
    # Disguise the request as a browser to avoid basic anti-scraping checks
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44'
    }
    response = requests.get(url=url, headers=headers).text  # fetch the page source
    soup = BeautifulSoup(response, 'lxml')  # build a BeautifulSoup object
    # print(soup)
    li_list = soup.select('.ml_list>ul>li')  # every <li> in the <ul> under class='ml_list'
    # Create the output text file for writing, UTF-8 encoded
    with open(name + '.txt', 'w', encoding='UTF-8') as fp:
        for li in li_list:
            title = li.string  # chapter title
            detail_url = 'https://www.1234u.net' + li.a['href']  # join the site prefix with the chapter's href
            detail_page_text = requests.get(url=detail_url, headers=headers).text  # fetch the chapter page source
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            content_tag = detail_soup.find('p', class_="articlecontent")  # the tag holding the chapter body
            content = content_tag.text
            fp.write(title + ':' + content + '\n')  # write the chapter to the file
            print(title, 'scraped successfully!!!')
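
# The scrapers in this file all repeat the same bare requests.get() call with
# no error handling, so a timeout or an HTTP error page gets parsed as if it
# were real content. A minimal sketch of a shared fetch helper they could use
# instead (the name fetch_soup and the 10-second timeout are assumptions, not
# part of the original script):
def fetch_soup(url, headers):
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
    return BeautifulSoup(response.text, 'lxml')
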
def findNovelPrice(book_list):
    for item in book_list:  # iterate over book_list element by element
        ls = []  # fresh list to collect the scraped records
        print("Scraping information for {0}".format(item))
        for page in range(1, 99):
            print(f'======================== Saving page {page} ===================================')
            url = f'https://search.dangdang.com/?key={item}&act=input&page_index={page}'  # item is the book name, page the page number
            # Request headers, disguised as a browser
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32'
            }
            response = requests.get(url=url, headers=headers)  # send the request to the server
            selector = parsel.Selector(response.text)  # initialise the parsel selector
            lis = selector.css('.bigimg li')  # all the <li> result tags
            for li in lis:  # pull the book details out of each <li>
                title = li.css('.name a::attr(title)').get()  # title / book name
                price_n = li.css('.price .search_now_price::text').get()  # current price
                dit = {
                    '标题': title,
                    '售价': price_n,
                }
                ls.append(dit)
        file_name = str(item) + '.json'  # file name: the book_list element plus a .json suffix
        with open(file_name, "w", encoding='utf-8') as fw:
            a = json.dumps(ls[1:], sort_keys=False, indent=4, ensure_ascii=False)  # serialise to a JSON string, skipping the first record
            fw.write(a)  # write the JSON to the file
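
# The prices saved above are strings straight from the page, typically with a
# currency prefix, so they cannot be compared numerically as-is. A hedged
# sketch of a conversion step (the '¥' prefix is an assumption about
# Dangdang's markup; parse_price is not part of the original script):
def parse_price(price_text):
    if not price_text:
        return None
    return float(price_text.lstrip('¥').strip())
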
def findNovelComment(novelname, url):
    # Disguise the request as a browser to avoid basic anti-scraping checks
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.12151 SLBChan/11"
    }
    # Fetch the comments page
    res = requests.get(url=url, headers=headers)
    res = BeautifulSoup(res.text, "lxml")  # parse the response with the lxml parser
    comments = res.find("div", id="comments")
    # print(comments)
    p_list = comments.find_all("p")
    # Collect the comments into a list before writing them to a txt file
    data_list = []
    for i in p_list:
        span = i.find("span")
        if span is not None and span.string:  # skip <p> tags that carry no comment text
            data_list.append(span.string)
    with open("三国演义短评.txt", "a", encoding='utf-8', errors='ignore') as f:
        f.write(novelname + '\n')
        for i in data_list:  # write the list contents to the text file
            f.write(i + "\n")  # one comment per line
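
# The Douban URL passed in below only covers the first page of comments
# (start=0&limit=20). A sketch of paging through more by bumping the start
# offset (findAllComments and the five-page limit are assumptions, not part
# of the original script):
def findAllComments(novelname, base_url):
    for start in range(0, 100, 20):  # pages 1-5, 20 comments each
        url = base_url.replace('start=0', f'start={start}')
        findNovelComment(novelname, url)
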
if __name__ == '__main__':
    # Novel content: chapter-list URL keyed by book title
    param = {'三国演义': 'https://www.1234u.net/xiaoshuo/121/121220/'}
    for a, b in param.items():
        findNovelContent(b, a)
    # Novel prices, by book name
    book_list = ['三国演义']
    findNovelPrice(book_list)
    # Novel reviews
    param1 = {
        '\n--------三国演义--------\n': "https://book.douban.com/subject/26416768/comments/?start=0&limit=20&status=P&sort=new_score"
    }
    for name, url in param1.items():
        findNovelComment(name, url)
        print("Comments scraped successfully!!")