Compare commits


6 Commits

@@ -1,6 +1,10 @@
# Spider
This project contains crawlers for various websites, updated continuously ~~(little free time, so updates are slow)~~.
Project list ~~(wishful plans included)~~:
- [x] [Jiujiu Novel Site novel scraper](%E4%B9%85%E4%B9%85%E5%B0%8F%E8%AF%B4%E7%BD%91)
- [ ] Bing daily wallpaper scraper
- [ ] Bilibili video scraper
# Jiujiu Novel Site Novel Scraper
This program downloads all novels from [Jiujiu Novel Site](https://txt909.com/), organized by category.
Features:
- Uses **requests** and **BeautifulSoup** to fetch and parse pages
- The novel category to download can be changed via the `category` variable
- Downloaded novels are stored in a directory named after their category
- The range of listing pages to download can be set via the `start_page` and `end_page` variables (see the sketch below)
- Checks whether a novel has already been downloaded, so duplicates are skipped
- Prints a progress message to the terminal for each novel downloaded
- More features are left for you to explore
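
For orientation, here is a minimal sketch of the configuration block at the bottom of the script (the variable names come from the code below; `category` must be one of the keys of the script's `categories` dict):

```python
# Edit these module-level variables at the bottom of the script, then run it.
# `category` must match a key of the `categories` dict defined in the script.
category = '科幻小说'  # which category to download (here: sci-fi)
start_page = 1         # first listing page to crawl
end_page = 200         # last listing page to crawl (inclusive)
```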

@@ -0,0 +1,58 @@
import requests
from bs4 import BeautifulSoup
import os


def get_download_link(url, page):
    '''Return a dict mapping novel titles to article ids for one listing page.'''
    resp = requests.get(f'{url}{page}.html', headers=headers)
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    divs = soup.find_all('div', attrs={'class': 'listbg'})
    # The first <a> of each entry carries the title; its href looks like
    # '/txt/<id>.html', so strip the 5-character prefix and '.html' suffix.
    novel_dic = {div.find_all('a')[0].attrs['title']:
                 div.find_all('a')[0].attrs['href'][5:-5] for div in divs}
    return novel_dic


def download(name, href, page, x):
    '''Download one novel, skipping it if the file already exists.'''
    name = name.replace('?', '')  # strip characters that are illegal in filenames
    if f'{name}.txt' in os.listdir(category):
        print(f'{name} already exists')
        return
    link = f'http://www.vbiquge.co/api/txt_down.php?articleid={href}&articlename={name}'
    resp = requests.get(link, headers=headers).text
    with open(f'{category}/{name}.txt', 'w', encoding='utf-8') as f:
        f.write(resp)
    print(f'Downloading page {page}, novel {x}: {name}')


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}
# Category names, as used on the site, mapped to their listing URLs.
categories = {
    '穿越小说': 'https://www.txt909.com/html/chuanyue/',
    '言情小说': 'https://www.txt909.com/html/yanqing/',
    '现代都市': 'https://www.txt909.com/html/dushi/',
    '耽美百合': 'https://www.txt909.com/html/baihe/',
    '历史架空': 'https://www.txt909.com/html/lishi/',
    '美文同人': 'https://www.txt909.com/html/tongren/',
    '武侠仙侠': 'https://www.txt909.com/html/wuxia/',
    '玄幻小说': 'https://www.txt909.com/html/xuanhuan/',
    '惊悚悬疑': 'https://www.txt909.com/html/jingsong/',
    '科幻小说': 'https://www.txt909.com/html/kehuan/',
    '网游竞技': 'https://www.txt909.com/html/wangyou/'}
category = '科幻小说'
start_page = 1
end_page = 200
if category in categories:
    if not os.path.exists('./' + category):
        os.mkdir('./' + category)
    for page in range(start_page, end_page + 1):  # end_page is inclusive
        novel_dic = get_download_link(categories[category], page)
        x = 1
        for name, href in novel_dic.items():
            download(name, href, page, x)
            x += 1

@@ -0,0 +1,31 @@
import requests
from bs4 import BeautifulSoup
import os

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}
category = '武侠'
url = 'https://www.txt909.com/html/wuxia/'
if not os.path.exists(category):
    os.mkdir(category)  # os.listdir() below fails if the directory is missing
for i in range(1, 208):
    resp = requests.get(f'{url}{i}.html', headers=headers)
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    divs = soup.find_all('div', attrs={'class': 'listbg'})
    # Map each novel's title to its article id ('/txt/<id>.html' -> '<id>').
    novel_pages = {div.find_all('a')[0].attrs['title']:
                   div.find_all('a')[0].attrs['href'][5:-5] for div in divs}
    x = 1
    for name, href in novel_pages.items():
        name = name.replace('?', '')  # strip characters illegal in filenames
        if f'{name}.txt' in os.listdir(category):
            print(f'{name} already exists')
            continue
        link = f'http://www.vbiquge.co/api/txt_down.php?articleid={href}&articlename={name}'
        resp = requests.get(link, headers=headers).text
        with open(f'{category}/{name}.txt', 'w', encoding='utf-8') as f:
            f.write(resp)
        print(f'Downloading page {i}, novel {x}: {name}')
        x += 1
print('Download complete')