# Scrape book details from the qimao.com ranking page and store them in MySQL.
import re

import pymysql
import requests
from lxml import etree

def getDB():
    # Open a connection to the local MySQL database used for storage.
    db = pymysql.connect(host='localhost', user='root', password='123456', database='douban')
    return db

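# The INSERT below writes to a table named `xiaoshuo`, which this script does
# not create. A minimal sketch of a compatible schema, assuming generic
# VARCHAR/TEXT column types (only the column names come from the INSERT
# statement; the types and lengths are assumptions):
#
#   CREATE TABLE xiaoshuo (
#       names    VARCHAR(255),
#       ranks    VARCHAR(64),
#       authors  VARCHAR(255),
#       summarys TEXT,
#       urls     VARCHAR(255)
#   );
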
def Agent_info():
    # Desktop Chrome User-Agent so the requests are not rejected as a bot.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }
    return headers

def get_url():
    # Fetch the ranking page and collect the detail-page URL of every listed book.
    headers = Agent_info()
    manh_url = 'https://www.qimao.com/paihang'
    res = requests.get(url=manh_url, headers=headers)
    htmldata = res.text
    # Each book links to https://www.qimao.com/shuku/<id>/ ; grab the numeric ids.
    urls = re.findall(r'<a href="https://www.qimao.com/shuku/(\d+)/"', htmldata)
    # Rebuild the full URLs, keeping only the first occurrence of each.
    urls2 = []
    for url in urls:
        url = 'https://www.qimao.com/shuku/' + url + '/'
        if url not in urls2:
            urls2.append(url)
    return urls2

def get_url_info(manh_url):
    # Fetch one book's detail page, extract its fields and insert them into MySQL.
    print('Fetching page:', manh_url)
    headers = Agent_info()
    res = requests.get(manh_url, headers=headers)
    tree = etree.HTML(res.text)
    # Each XPath targets the rendered detail-page layout and returns a list of
    # text nodes, so take the first match (or an empty string if nothing matched).
    ranks = tree.xpath('//*[@id="__layout"]/div/div[3]/div/div/div/div[1]/div/div[1]/div[2]/div[1]/span[2]/text()')
    ranks = ranks[0].strip() if ranks else ''
    names = tree.xpath('//*[@id="__layout"]/div/div[3]/div/div/div/div[1]/div/div[1]/div[2]/div[1]/span[1]/text()')
    names = names[0].strip() if names else ''
    authors = tree.xpath('//*[@id="__layout"]/div/div[3]/div/div/div/div[1]/div/div[1]/div[2]/div[3]/span[1]/em/a/text()')
    authors = authors[0].strip() if authors else ''
    summarys = tree.xpath('//*[@id="__layout"]/div/div[3]/div/div/div/div[2]/div/div[2]/div[1]/div/div[1]/div[2]/p/text()')
    summarys = summarys[0].strip() if summarys else ''
    # Parameterised INSERT so quotes in the scraped text cannot break the SQL.
    sql = 'insert into xiaoshuo (names, ranks, authors, summarys, urls) values (%s, %s, %s, %s, %s)'
    db = getDB()
    cursor = db.cursor()
    try:
        cursor.execute(sql, (names, ranks, authors, summarys, manh_url))
        db.commit()
    except Exception as err:
        print(err)
        db.rollback()
    cursor.close()
    db.close()

if __name__ == '__main__':
    urls = get_url()
    for url in urls:
        get_url_info(url)