import json

import parsel
import requests
from bs4 import BeautifulSoup
def findNovelContent(url, name):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44'
    }  # pose as a browser so the site does not block the crawler
    response = requests.get(url=url, headers=headers).text  # fetch the chapter-list page source
    soup = BeautifulSoup(response, 'lxml')  # build a BeautifulSoup object
    # print(soup)
    li_list = soup.select('.ml_list>ul>li')  # all li tags inside the ul under class 'ml_list'
    fp = open(name + '.txt', 'w', encoding='UTF-8')  # create a UTF-8 text file named after the novel
    for li in li_list:
        title = li.string  # chapter title
        detail_url = 'https://www.1234u.net' + li.a['href']  # join the site prefix with the chapter href
        detail_page_text = requests.get(url=detail_url, headers=headers).text  # fetch the chapter page source
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('p', class_="articlecontent")  # tag holding the chapter body
        content = div_tag.text
        fp.write(title + ':' + content + '\n')  # write the chapter to the file
        print(title, 'scraped successfully!!!')
    fp.close()
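# Example call (using the same URL as the __main__ block at the end of this script);
# the function writes the chapters into '<name>.txt' in the working directory:
#   findNovelContent('https://www.1234u.net/xiaoshuo/121/121220/', '三国演义')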
def findNovelPrice(book_list):
    for item in iter(book_list):  # iterate over book_list element by element
        ls = []  # empty list that will collect the scraped records
        print("Start scraping information for {0}".format(item))
        for page in range(1, 99):
            print(f'======================== saving data from page {page} ========================')
            url = f'https://search.dangdang.com/?key={item}&act=input&page_index={page}'  # item is the book name, page is the page index
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32'
            }  # request headers
            response = requests.get(url=url, headers=headers)  # send the request to the server
            selector = parsel.Selector(response.text)  # build the parsel selector
            lis = selector.css('.bigimg li')  # all li tags holding search results
            for li in lis:  # pull the book information out of each li tag
                title = li.css('.name a::attr(title)').get()  # title / book name
                price_n = li.css('.price .search_now_price::text').get()  # current price
                dit = {
                    '标题': title,
                    '售价': price_n,
                }
                ls.append(dit)
        file_name = str(item) + '.json'  # book name plus a .json suffix as the file name
        fw = open(file_name, "w", encoding='utf-8')
        a = json.dumps(ls[1:], sort_keys=False, indent=4, ensure_ascii=False)  # skip the first record and dump the rest as formatted JSON
        fw.write(a)  # write the JSON string to the file
        fw.close()  # close the file
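# Note (not part of the original script): the JSON written by findNovelPrice can be
# loaded back with the standard library for a quick sanity check, e.g.:
#   with open('三国演义.json', encoding='utf-8') as f:
#       records = json.load(f)
#   print(len(records), 'listings collected')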
def findNovelComment(novelname, url):
    # pose as a browser so the site does not block the crawler
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.12151 SLBChan/11"
    }
    # fetch the comments page
    res = requests.get(url=url, headers=headers)
    res = BeautifulSoup(res.text, "lxml")  # parse the scraped page with the lxml parser
    comments = res.find("div", id="comments")
    print(comments)
    p_list = comments.find_all("p")
    # write the comments that were read into a txt file
    data_list = []  # store the comment text in a list first
    for i in p_list:
        span = i.find("span")
        if span is not None and span.string:  # skip p tags that carry no comment text
            data_list.append(span.string)
    with open("三国演义短评.txt", "a", encoding='utf-8', errors='ignore') as f:
        f.write(novelname + '\n')
        for i in data_list:  # write the list contents into the text file
            f.write(i + "\n")  # one comment per line
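# Note (an assumption about the douban URL passed in below, not something this function does):
# the comments page appears to be paginated through its start/limit query parameters, so
# fetching more than the first 20 reviews would need extra requests with a larger start value.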
if __name__ == '__main__':
    # novel content: name -> chapter-list URL
    param = {'三国演义': 'https://www.1234u.net/xiaoshuo/121/121220/'}
    for a, b in param.items():
        findNovelContent(b, a)

    # novel prices: list of book names
    book_list = ['三国演义']
    findNovelPrice(book_list)

    # novel reviews
    param1 = {
        '\n--------三国演义--------\n': "https://book.douban.com/subject/26416768/comments/?start=0&limit=20&status=P&sort=new_score"
    }
    for name, url in param1.items():
        findNovelComment(name, url)
    print("Reviews scraped successfully!!")