hunjianghu/gzy/数据库/ParseHtml.py

# -*- coding: utf-8 -*-

from pyquery import PyQuery as pq
import getvalue as gv
import requests
import getBooks
import os

# Base directory used by main(): the per-page list files are read from here
# and the per-page output directories are created alongside them.
path = './Books'


def del_file(path):
    """Recursively delete every file under ``path``, keeping the directories.

    Sub-directories are emptied but not removed, so the directory tree
    itself stays in place. Symbolic links are unlinked rather than followed,
    so nothing outside ``path`` is deleted through a link.

    Args:
        path: Directory whose files should be removed.

    Raises:
        OSError: If ``path`` cannot be listed or an entry cannot be removed.
    """
    for name in os.listdir(path):
        c_path = os.path.join(path, name)
        # os.path.isdir() follows symlinks; exclude links explicitly so a
        # link to a directory is removed itself instead of having the
        # contents of its target wiped.
        if os.path.isdir(c_path) and not os.path.islink(c_path):
            del_file(c_path)
        else:
            os.remove(c_path)


def mkdir(path):
    """Ensure ``path`` is a directory ready to receive fresh output.

    If ``path`` already exists, the files beneath it are deleted via
    ``del_file``. Otherwise the directory is created, together with any
    missing parent directories.

    Args:
        path: Directory to create or clear.

    Raises:
        OSError: If the directory cannot be created or cleared.
    """
    if os.path.exists(path):
        del_file(path)
    else:
        # makedirs rather than os.mkdir, so a missing parent directory does
        # not abort the run.
        os.makedirs(path)


def storebook(spath, BookId, BookName, Price, Author, introduce, Type, Imgae,
              timeout=30):
    """Persist one book's metadata, introduction and cover image in ``spath``.

    Three files are produced, all named after ``BookName``:

    * ``<BookName>``     -- id, name, price, author and type, one per line
    * ``<BookName>.txt`` -- the introduction text
    * ``<BookName>.jpg`` -- the raw bytes downloaded from ``Imgae``

    The two text files are opened in append mode, so calling this twice for
    the same name adds to the existing files instead of replacing them.

    Args:
        spath: Directory the files are written into.
        BookId: Book identifier, as a string.
        BookName: Book title; also used as the base file name.
        Price: Book price; converted with ``str()`` before writing.
        Author: Author name, as a string.
        introduce: Introduction text, as a string.
        Type: Book type/category, as a string.
        Imgae: URL of the cover image (spelling kept for caller compatibility).
        timeout: Seconds to wait for the image server before giving up.

    Raises:
        requests.RequestException: If the image download fails or times out.
        OSError: If any of the files cannot be written.
        TypeError: If one of the string fields is not a ``str``.
    """
    # Download first: if the request fails or times out, nothing has been
    # written yet, so no half-finished record is left in the append-mode
    # text files. The timeout keeps a stalled server from hanging the run.
    image_bytes = requests.get(Imgae, timeout=timeout).content

    book_info = os.path.join(spath, BookName)
    with open(book_info, 'a', encoding='utf-8') as file:
        fields = [BookId, BookName, str(Price), Author, Type]
        file.write('\n'.join(fields) + '\n')

    book_introduce = os.path.join(spath, BookName + ".txt")
    with open(book_introduce, 'a', encoding='utf-8') as file:
        file.write(introduce)

    book_Image = os.path.join(spath, BookName + '.jpg')
    with open(book_Image, 'wb') as file:
        file.write(image_bytes)

def main():
    """Fetch every listed book page and store its data on disk.

    After calling ``getBooks.getBookinfo()``, iterates over ``getBooks.dict``.
    For each ``page`` key it reads the list file ``<path>/<page>`` (one entry
    per line, the first whitespace-separated token being the book URL),
    prepares the output directory ``<path>/<page>小说`` and stores each book
    there through ``storebook``. Books are numbered sequentially across all
    pages, starting at 1.

    Blank lines in a list file are skipped.

    Raises:
        requests.RequestException: If a page or image download fails.
        OSError: If a list file cannot be read or output cannot be written.
    """
    request_timeout = 30  # seconds; keeps a stalled server from hanging the run
    total = 1
    # NOTE(review): presumably this call populates getBooks.dict and writes
    # the list files read below -- confirm against getBooks.
    getBooks.getBookinfo()
    for page in getBooks.dict:
        print(page)
        store = os.path.join(path, page)
        spath = store + '小说'
        mkdir(spath)
        with open(store, encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank or whitespace-only line (e.g. a trailing newline).
                    continue
                url = fields[0]
                response = requests.get(url, timeout=request_timeout)
                # Round-trip through PyQuery before handing the markup to the
                # gv helpers -- presumably to normalise it; TODO confirm.
                html = str(pq(response.text))
                BookId = str(total)
                BookName = gv.getBookName(html)
                Price = 111  # fixed placeholder price used for every book
                Author = gv.getAuthor(html)
                introduce = gv.getText(html)
                Type = page
                Imgae = gv.getImage(html)
                storebook(spath, BookId, BookName, Price, Author, introduce,
                          Type, Imgae)
                print(total, BookName)
                total += 1


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()