Compare commits

4 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 57f47d46f0 | 4 years ago |
|  | 6456a76a07 | 4 years ago |
|  | 5ba6c20afa | 4 years ago |
|  | 96b9f36017 | 4 years ago |
@@ -1,10 +1,6 @@

# 久久小说网 novel scraper

This program downloads all of the novels on [久久小说网](https://txt909.com/), organised by category.

Features:

- Uses **requests** and **BeautifulSoup** to fetch and parse pages
- The category to download can be changed via the variable `category`
- Downloaded novels are stored in a directory named after their category
- The page range to download is set via the variables `start_page` and `end_page` (see the configuration sketch after the project list below)
- Already-downloaded novels are detected automatically, so nothing is downloaded twice
- A message is printed to the terminal for each novel downloaded
- Further features are left for you to explore

# Spider

This repository collects crawlers for various websites and is updated continuously ~~(little free time, slow updates)~~.

Project list ~~(including plans that are still just plans)~~:

- [x] [久久小说网 novel scraper](%E4%B9%85%E4%B9%85%E5%B0%8F%E8%AF%B4%E7%BD%91)
- [ ] Bing daily wallpaper scraper
- [ ] Bilibili video scraper
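The feature bullets above refer to the `category`, `start_page`, and `end_page` variables. A minimal sketch of how they are meant to be set, reusing names and category URLs from the script below; the error handling and final print here are illustrative only:

```python
# Sketch of the configuration knobs described in the README above.
# `categories`, `category`, `start_page`, and `end_page` mirror the script
# below; only two categories are listed here for brevity.
categories = {
    '科幻小说': 'https://www.txt909.com/html/kehuan/',
    '武侠仙侠': 'https://www.txt909.com/html/wuxia/',
}

category = '科幻小说'   # which category to download; must be a key of `categories`
start_page = 1          # first listing page to fetch
end_page = 200          # last listing page to fetch

if category not in categories:
    raise SystemExit(f'unknown category: {category}')
print(f'downloading pages {start_page}-{end_page} from {categories[category]}')
```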
@@ -1,58 +0,0 @@

import requests
from bs4 import BeautifulSoup
import os


def get_download_link(url, page):
    '''Return a dict mapping novel titles on one listing page to their article ids.'''
    resp = requests.get(f'{url}{page}.html', headers=headers)
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    div = soup.find_all('div', attrs={'class': 'listbg'})
    # The slice strips the path prefix and the '.html' suffix from each
    # detail-page href, leaving only the article id.
    novel_dic = {
        a.find_all('a')[0].attrs['title']: a.find_all('a')[0].attrs['href'][5:-5]
        for a in div
    }
    return novel_dic


def download(name, href, page, x):
    '''Download one novel as a .txt file, skipping it if it already exists.'''
    name = name.replace('?', '')  # drop characters that are invalid in file names
    if f'{name}.txt' in os.listdir(category):
        print(f'{name} 已存在')  # already downloaded
        return
    link = f'http://www.vbiquge.co/api/txt_down.php?articleid={href}&articlename={name}'
    resp = requests.get(link, headers=headers).text
    with open(f'{category}/{name}.txt', 'w', encoding='utf-8') as f:
        f.write(resp)
    print(f'正在下载第{page}页第{x}篇:{name}')  # downloading novel x on page `page`


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}
categories = {
    '穿越小说': 'https://www.txt909.com/html/chuanyue/',
    '言情小说': 'https://www.txt909.com/html/yanqing/',
    '现代都市': 'https://www.txt909.com/html/dushi/',
    '耽美百合': 'https://www.txt909.com/html/baihe/',
    '历史架空': 'https://www.txt909.com/html/lishi/',
    '美文同人': 'https://www.txt909.com/html/tongren/',
    '武侠仙侠': 'https://www.txt909.com/html/wuxia/',
    '玄幻小说': 'https://www.txt909.com/html/xuanhuan/',
    '惊悚悬疑': 'https://www.txt909.com/html/jingsong/',
    '科幻小说': 'https://www.txt909.com/html/kehuan/',
    '网游竞技': 'https://www.txt909.com/html/wangyou/',
}
category = '科幻小说'
start_page = 1
end_page = 200

if category in categories:
    if not os.path.exists('./' + category):
        os.mkdir('./' + category)

    # Note: range() stops before end_page itself.
    for page in range(start_page, end_page):
        novel_dic = get_download_link(categories[category], page)
        x = 1
        for name, href in novel_dic.items():
            download(name, href, page, x)
            x += 1
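As a rough illustration of what `get_download_link` above is expected to return, a hypothetical call is sketched below; it assumes the script above has already been run (or imported) and that the site's listing markup is unchanged, and the titles and ids shown are invented:

```python
# Hypothetical usage of get_download_link from the script above.
# The URL is one of the entries in `categories`; the titles and article ids
# in the comment are made up for illustration.
novel_dic = get_download_link('https://www.txt909.com/html/kehuan/', 1)
# novel_dic might look like: {'某科幻小说': '12345', '另一本小说': '67890'}
for name, article_id in novel_dic.items():
    print(name, article_id)
```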
@@ -1,31 +0,0 @@

import requests
from bs4 import BeautifulSoup
import os

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}

category = '武侠'
url = 'https://www.txt909.com/html/wuxia/'
os.makedirs(category, exist_ok=True)  # make sure the output directory exists

for i in range(1, 208):
    # Fetch and parse one listing page of the category.
    resp = requests.get(f'{url}{i}.html', headers=headers)
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    div = soup.find_all('div', attrs={'class': 'listbg'})
    # Map each novel title to its article id (strip the href prefix and '.html').
    novel_pages = {
        a.find_all('a')[0].attrs['title']: a.find_all('a')[0].attrs['href'][5:-5]
        for a in div
    }
    x = 1
    for name, href in novel_pages.items():
        name = name.replace('?', '')  # drop characters that are invalid in file names
        if f'{name}.txt' in os.listdir(category):
            print(f'{name} 已存在')  # already downloaded
            continue
        link = f'http://www.vbiquge.co/api/txt_down.php?articleid={href}&articlename={name}'
        resp = requests.get(link, headers=headers).text
        with open(f'{category}/{name}.txt', 'w', encoding='utf-8') as f:
            f.write(resp)
        print(f'正在下载第{i}页第{x}篇:{name}')  # downloading novel x on page i
        x += 1

print('下载完毕')  # all pages processed