parent
340bc5134b
commit
aba3ab16b6
@ -0,0 +1,96 @@
import requests
from bs4 import BeautifulSoup
import csv
import os

# Table-of-contents page of the novel on lazytxt.shop
url = 'http://www.lazytxt.shop:8081/xs/112/112040/'

# Send a desktop-browser User-Agent so the site serves the normal page
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"}

response = requests.get(url, headers=header)
response.encoding = "gbk"  # the site serves GBK-encoded HTML
#print(response.url)
#print(response.text)

soup = BeautifulSoup(response.text, "lxml")

# All chapter links in the table of contents
re = soup.select('div.book_list ul li a')
#print(re)

# Create the output directory named after the book
dir_name = "许一个愿忘记你"
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

# Write every chapter URL into a one-column CSV index inside that directory
with open('许一个愿忘记你/许一个愿忘记你.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['链接']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    urls = []
    for i in re:
        c_url = i.get('href')
        url1 = "http://www.lazytxt.shop:8081"
        full_url = url1 + c_url
        writer.writerow({'链接': full_url})
        print(full_url)
        urls.append(full_url)
    print(len(urls))

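# Sketch: the loop above concatenates the host and the relative href by hand.
# urllib.parse.urljoin from the standard library performs the same join and
# also copes with absolute hrefs; the chapter path below is only a made-up
# example, not one taken from the site.
#
#     from urllib.parse import urljoin
#     full_url = urljoin("http://www.lazytxt.shop:8081", "/xs/112/112040/1.html")
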
re1 = soup.select('span.red a,h3.bookinfo_intro')
for i in re1:  # print the book title, author and synopsis
    print(i.text)

# First pass over the chapter links: fetch each chapter page and print its title
for i in re:
    c_url = i.get('href')
    url1 = "http://www.lazytxt.shop:8081"
    full_url = url1 + c_url
    response = requests.get(full_url, headers=header)
    response.encoding = "gbk"
    soup = BeautifulSoup(response.text, "lxml")
    chapter = soup.select_one('div.h1title h1').text
    print(chapter)

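# Note that the loop above only prints the chapter titles; the loop below
# requests the same pages again to save their text. If the site needs gentler
# crawling, a short pause between requests would help, e.g. (sketch):
#
#     import time
#     ...
#     response = requests.get(full_url, headers=header)
#     time.sleep(1)  # wait one second between chapter requests
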
# Second pass: fetch each chapter again and save its text to a .txt file
for i in re:
    c_url = i.get('href')
    url1 = "http://www.lazytxt.shop:8081"
    full_url = url1 + c_url
    response = requests.get(full_url, headers=header)
    response.encoding = "gbk"
    soup = BeautifulSoup(response.text, 'lxml')
    chapter = soup.select_one('div.h1title h1').text
    print()
    print(chapter)
    content = soup.find('div', {'id': 'htmlContent'}).text.strip()
    print(content)
    #content = content.replace(" ", "")
    file_name = os.path.join(dir_name, f"{chapter}.txt")
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(content)

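# Chapter titles go straight into the file names above; if a title ever
# contains a character that is not allowed in file names (such as / \ : * ? " < > |),
# the open() call may fail. A minimal sanitizer sketch (safe_name is just an
# illustrative helper, not part of the original script):
#
#     def safe_name(name):
#         return "".join(c for c in name if c not in '\\/:*?"<>|').strip()
#
#     file_name = os.path.join(dir_name, f"{safe_name(chapter)}.txt")
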
def remove_leading_spaces_in_txt_files(directory):
    # Walk every file and subdirectory under the given directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Check whether the file is a .txt file
            if file.endswith('.txt'):
                # Build the full path to the file
                file_path = os.path.join(root, file)

                # Read the file content
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()

                # Strip the spaces at the start of every line and rebuild the content
                # Note: this assumes every line should get the same treatment, i.e. removing leading whitespace
                modified_content = '\n'.join(line.lstrip() for line in content.splitlines())

                # Write the content back to the file
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(modified_content)

# Call the function; replace the argument with your own directory path
remove_leading_spaces_in_txt_files('许一个愿忘记你')