Compare commits


6 Commits

@@ -1,6 +1,10 @@
# Spider
This project contains crawlers for various websites, updated continuously ~~(little free time, so updates are slow)~~.
Project list ~~(wishful plans included)~~:
- [x] [Jiujiu Novel Site novel scraper](%E4%B9%85%E4%B9%85%E5%B0%8F%E8%AF%B4%E7%BD%91)
- [ ] Bing daily wallpaper scraper
- [ ] Bilibili video scraper
# Jiujiu Novel Site Novel Scraper
This program downloads all novels from [Jiujiu Novel Site](https://txt909.com/), organized by category.
Features:
- Uses **requests** and **BeautifulSoup** to fetch and parse pages
- The novel category to download can be changed via the `category` variable
- Downloaded novels are stored in a directory named after their category
- The range of listing pages to download can be set via the `start_page` and `end_page` variables (see the sketch below)
- Checks whether a novel has already been downloaded, so duplicates are skipped
- Prints a progress message to the terminal for each novel downloaded
- More features are left for you to explore
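
For orientation, here is a minimal sketch of the configuration block at the bottom of the script (the variable names come from the code below; `category` must be one of the keys of the script's `categories` dict):

```python
# Edit these module-level variables at the bottom of the script, then run it.
# `category` must match a key of the `categories` dict defined in the script.
category = '科幻小说'  # which category to download (here: sci-fi)
start_page = 1         # first listing page to crawl
end_page = 200         # last listing page to crawl (inclusive)
```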

@@ -0,0 +1,58 @@
import requests
from bs4 import BeautifulSoup
import os


def get_download_link(url, page):
    '''Return a dict mapping novel titles to article ids for one listing page.'''
    resp = requests.get(f'{url}{page}.html', headers=headers)
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    divs = soup.find_all('div', attrs={'class': 'listbg'})
    # The first <a> of each entry carries the title; its href looks like
    # '/txt/<id>.html', so strip the 5-character prefix and '.html' suffix.
    novel_dic = {div.find_all('a')[0].attrs['title']:
                 div.find_all('a')[0].attrs['href'][5:-5] for div in divs}
    return novel_dic


def download(name, href, page, x):
    '''Download one novel, skipping it if the file already exists.'''
    name = name.replace('?', '')  # strip characters that are illegal in filenames
    if f'{name}.txt' in os.listdir(category):
        print(f'{name} already exists')
        return
    link = f'http://www.vbiquge.co/api/txt_down.php?articleid={href}&articlename={name}'
    resp = requests.get(link, headers=headers).text
    with open(f'{category}/{name}.txt', 'w', encoding='utf-8') as f:
        f.write(resp)
    print(f'Downloading page {page}, novel {x}: {name}')


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}
# Category names, as used on the site, mapped to their listing URLs.
categories = {
    '穿越小说': 'https://www.txt909.com/html/chuanyue/',
    '言情小说': 'https://www.txt909.com/html/yanqing/',
    '现代都市': 'https://www.txt909.com/html/dushi/',
    '耽美百合': 'https://www.txt909.com/html/baihe/',
    '历史架空': 'https://www.txt909.com/html/lishi/',
    '美文同人': 'https://www.txt909.com/html/tongren/',
    '武侠仙侠': 'https://www.txt909.com/html/wuxia/',
    '玄幻小说': 'https://www.txt909.com/html/xuanhuan/',
    '惊悚悬疑': 'https://www.txt909.com/html/jingsong/',
    '科幻小说': 'https://www.txt909.com/html/kehuan/',
    '网游竞技': 'https://www.txt909.com/html/wangyou/'}
category = '科幻小说'
start_page = 1
end_page = 200
if category in categories:
    if not os.path.exists('./' + category):
        os.mkdir('./' + category)
    for page in range(start_page, end_page + 1):  # end_page is inclusive
        novel_dic = get_download_link(categories[category], page)
        x = 1
        for name, href in novel_dic.items():
            download(name, href, page, x)
            x += 1

@@ -0,0 +1,31 @@
import requests
from bs4 import BeautifulSoup
import os

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}
category = '武侠'
url = 'https://www.txt909.com/html/wuxia/'
if not os.path.exists(category):
    os.mkdir(category)  # os.listdir() below fails if the directory is missing
for i in range(1, 208):
    resp = requests.get(f'{url}{i}.html', headers=headers)
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    divs = soup.find_all('div', attrs={'class': 'listbg'})
    # Map each novel's title to its article id ('/txt/<id>.html' -> '<id>').
    novel_pages = {div.find_all('a')[0].attrs['title']:
                   div.find_all('a')[0].attrs['href'][5:-5] for div in divs}
    x = 1
    for name, href in novel_pages.items():
        name = name.replace('?', '')  # strip characters illegal in filenames
        if f'{name}.txt' in os.listdir(category):
            print(f'{name} already exists')
            continue
        link = f'http://www.vbiquge.co/api/txt_down.php?articleid={href}&articlename={name}'
        resp = requests.get(link, headers=headers).text
        with open(f'{category}/{name}.txt', 'w', encoding='utf-8') as f:
            f.write(resp)
        print(f'Downloading page {i}, novel {x}: {name}')
        x += 1
print('Download complete')