You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
hunjianghu/gzy/数据库/ParseHtml.py

74 lines
2.1 KiB

# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq
import getvalue as gv
import requests
import getBooks
import os
# Base directory used by main(): each category name is joined onto it to
# locate that category's list file and to derive its output directory.
path = './Books'
def del_file(path):
    """Delete every file found under *path*, descending into subdirectories.

    Only files are removed; *path* itself and any nested directories are
    left in place (empty).

    Args:
        path: Directory whose contents should be cleared.
    """
    for entry_name in os.listdir(path):
        target = os.path.join(path, entry_name)
        if not os.path.isdir(target):
            os.remove(target)
            continue
        # Directories are never removed, only emptied recursively.
        del_file(target)
def mkdir(path):
    """Make sure *path* exists and holds no leftover files.

    If *path* already exists, the files beneath it are removed with
    del_file() (nested directories are kept). Otherwise the directory is
    created together with any missing parent directories.

    Args:
        path: Directory to create or clear.
    """
    if os.path.exists(path):
        del_file(path)
    else:
        # makedirs instead of mkdir: a missing parent directory no longer
        # raises FileNotFoundError, and exist_ok tolerates the directory
        # appearing between the exists() check and this call.
        os.makedirs(path, exist_ok=True)
def storebook(spath, BookId, BookName, Price, Author, introduce, Type, Imgae,
              timeout=30):
    """Write one book's metadata, introduction and cover image into *spath*.

    Three files named after *BookName* are produced:

    * ``<BookName>``      - metadata, one field per line (appended)
    * ``<BookName>.txt``  - the introduction text (appended)
    * ``<BookName>.jpg``  - the downloaded cover bytes (overwritten)

    Args:
        spath: Directory that receives the files.
        BookId: Identifier string written as the first metadata line.
        BookName: Title; also used as the stem of all three file names.
        Price: Price value; converted with str() before writing.
        Author: Author string.
        introduce: Introduction text.
        Type: Category string.
        Imgae: URL of the cover image (spelling kept for caller
            compatibility).
        timeout: Seconds passed to requests.get for the cover download.

    Raises:
        requests.RequestException: If the cover download times out, fails
            to connect, or returns an HTTP error status.
    """
    book_info = os.path.join(spath, BookName)
    with open(book_info, 'a', encoding='utf-8') as file:
        file.write(BookId + '\n')
        file.write(BookName + '\n')
        file.write(str(Price) + '\n')
        file.write(Author + '\n')
        file.write(Type + '\n')

    book_introduce = os.path.join(spath, BookName + ".txt")
    with open(book_introduce, 'a', encoding='utf-8') as file:
        file.write(introduce)

    book_Image = os.path.join(spath, BookName + '.jpg')
    # Without a timeout requests.get can block forever on a stalled server.
    response = requests.get(Imgae, timeout=timeout)
    # Refuse to save an HTTP error page body under a .jpg name.
    response.raise_for_status()
    with open(book_Image, 'wb') as file:
        file.write(response.content)
def main():
    """Fetch and store every book listed in each category's list file.

    For each key of getBooks.dict (printed, and used as the book Type), the
    file ``<path>/<key>`` is read line by line. The first
    whitespace-separated field of each line is treated as the book page
    URL. The page is fetched, its fields are extracted with the ``gv``
    helpers, and the result is passed to storebook(), which writes into
    the directory ``<path>/<key>小说``. Books are numbered consecutively
    across all categories, starting at 1.
    """
    # Seconds allowed per page request; without a timeout requests.get can
    # block indefinitely.
    request_timeout = 30
    total = 1
    # NOTE(review): presumably this call produces the per-category list
    # files read below - confirm against getBooks.
    getBooks.getBookinfo()
    for page in getBooks.dict:
        print(page)
        store = os.path.join(path, page)
        spath = store + '小说'
        mkdir(spath)
        with open(store, encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # A blank or whitespace-only line used to raise
                    # IndexError here; skip it instead.
                    continue
                # Only the URL is needed: the title is taken from the page.
                url = fields[0]
                response = requests.get(url, timeout=request_timeout)
                html = str(pq(response.text))
                BookId = str(total)
                BookName = gv.getBookName(html)
                Price = 111  # same fixed price is stored for every book
                Author = gv.getAuthor(html)
                introduce = gv.getText(html)
                Type = page
                Imgae = gv.getImage(html)
                storebook(spath, BookId, BookName, Price, Author,
                          introduce, Type, Imgae)
                print(total, BookName)
                total = total + 1
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()