You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
44 lines
1.5 KiB
44 lines
1.5 KiB
import csv
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Index page of the book; every chapter link is discovered from here.
URL = "https://www.ckxxbz.com/book/pianpianchongai/"

# Browser-like User-Agent header sent with every request.
h = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/122.0.0.0"
}

# Directory that receives the per-chapter .txt files and the CSV index.
OUTPUT_DIR = '偏偏宠爱'
CSV_PATH = os.path.join(OUTPUT_DIR, '偏偏宠爱.csv')
FIELDNAMES = ['标题', '内容', '网址']

# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Characters that cannot appear in a file name on Windows and/or POSIX,
# plus ASCII control characters; each one is mapped to "_".
_FORBIDDEN_FILENAME_CHARS = str.maketrans(
    {ch: "_" for ch in '\\/:*?"<>|' + "".join(map(chr, range(32)))}
)


def safe_filename(title):
    """Return *title* turned into a string that is safe to use as a file name.

    Surrounding whitespace is stripped, path separators and other reserved
    characters are replaced with ``_``, and trailing dots/spaces are removed.
    ``"untitled"`` is returned when nothing usable is left.
    """
    cleaned = title.strip().translate(_FORBIDDEN_FILENAME_CHARS).rstrip(". ")
    return cleaned or "untitled"


def build_row(title, content, url):
    """Build the CSV row for one chapter.

    The row holds the chapter title, the length of its body text (in
    characters) and the chapter URL. The body itself is stored in the
    chapter's .txt file, so only its length goes into the CSV.
    """
    return {'标题': title, '内容': len(content), '网址': url}


def fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup (lxml parser).

    Raises ``requests.RequestException`` on network errors, timeouts and
    non-2xx responses.
    """
    response = requests.get(url, headers=h, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Decode as UTF-8 rather than relying on requests' charset guess.
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "lxml")


def main():
    """Scrape every chapter linked from ``URL``.

    For each chapter the body text is written to ``<OUTPUT_DIR>/<title>.txt``
    and one row (title, body length, URL) is appended to the CSV index.
    Chapters that fail to download or have no ``<h1>`` title are reported on
    stdout and skipped; a failure to fetch the index page itself propagates.
    """
    # The output directory must exist before any file is opened inside it.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    chapter_links = fetch_soup(URL).select("dl dd a")

    # Create the CSV file and write the header row.
    with open(CSV_PATH, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        writer.writeheader()

        # Walk the chapter links and fetch each chapter.
        for link in chapter_links:
            href = link.get('href')
            if not href:
                # An anchor without an href cannot be followed.
                continue
            # urljoin resolves root-relative, relative and absolute hrefs
            # against the index URL.
            full_url = urljoin(URL, href)

            try:
                page = fetch_soup(full_url)
            except requests.RequestException as exc:
                print(f"skipped {full_url}: {exc}")
                continue

            heading = page.find('h1')
            if heading is None:
                print(f"skipped {full_url}: no <h1> title found")
                continue
            title = heading.get_text(strip=True)

            # Body text is the concatenation of every <p> element's text.
            content = "".join(p.text for p in page.find_all('p'))

            print(title + '\t' + full_url)

            # Write the chapter body to a .txt file named after its title.
            txt_path = os.path.join(OUTPUT_DIR, f'{safe_filename(title)}.txt')
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(content)

            # Record the chapter's title, body length and URL in the CSV.
            writer.writerow(build_row(title, content, full_url))


if __name__ == "__main__":
    main()