You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

63 lines
1.9 KiB

import pymysql
import requests
from lxml import etree
import re
def getDB():
    """Open a new PyMySQL connection to the ``douban`` database on localhost.

    Returns:
        The connection object produced by ``pymysql.connect``. The caller
        is responsible for closing it.
    """
    # Connection settings are hard-coded: local server, root account.
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'database': 'douban',
    }
    return pymysql.connect(**settings)
def Agent_info():
    """Build the HTTP headers sent with every scraping request.

    Returns:
        A fresh dict with a single ``User-Agent`` entry carrying a desktop
        Chrome user-agent string.
    """
    # Adjacent literals are concatenated at compile time into one string.
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/123.0.0.0 Safari/537.36'
    )
    return {'User-Agent': user_agent}
def get_url(timeout=10):
    """Collect the book detail-page URLs linked from the Qimao ranking page.

    Args:
        timeout: Seconds to wait for the ranking page before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        A list of ``https://www.qimao.com/shuku/<id>/`` URLs, de-duplicated
        and kept in order of first appearance on the page.
    """
    manh_url = 'https://www.qimao.com/paihang'
    res = requests.get(url=manh_url, headers=Agent_info(), timeout=timeout)

    # Raw string so ``\d`` is a regex digit class rather than an invalid
    # string escape; the dots are escaped so they only match literal dots.
    book_ids = re.findall(
        r'<a href="https://www\.qimao\.com/shuku/(\d+)/"', res.text
    )

    # dict.fromkeys drops repeats in linear time while keeping first-seen order.
    return [
        'https://www.qimao.com/shuku/' + book_id + '/'
        for book_id in dict.fromkeys(book_ids)
    ]
def _xpath_text(tree, path):
    """Return the text nodes matched by *path*, joined and whitespace-stripped."""
    return ''.join(tree.xpath(path)).strip()


def get_url_info(manh_url, timeout=10):
    """Scrape one book detail page and insert its fields into ``xiaoshuo``.

    The rank, title, author and summary are read from fixed XPath locations
    on the page and stored, together with the page URL, as one row.

    Args:
        manh_url: URL of the book detail page to fetch.
        timeout: Seconds to wait for the page before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        None. A failed insert is printed and rolled back, not raised.
    """
    print("抓取网址", manh_url)
    res = requests.get(manh_url, headers=Agent_info(), timeout=timeout)
    tree = etree.HTML(res.text)

    # Rank, title and author all live under the same header container.
    header = '//*[@id="__layout"]/div/div[3]/div/div/div/div[1]/div/div[1]/div[2]'
    ranks = _xpath_text(tree, header + '/div[1]/span[2]/text()')
    names = _xpath_text(tree, header + '/div[1]/span[1]/text()')
    # NOTE(review): the previous code sliced two extra characters off each
    # side of the author text via a repr slice; this assumes they were
    # surrounding whitespace — confirm against a live page.
    authors = _xpath_text(tree, header + '/div[3]/span[1]/em/a/text()')
    summarys = _xpath_text(
        tree,
        '//*[@id="__layout"]/div/div[3]/div/div/div/div[2]/div/div[2]/div[1]/div/div[1]/div[2]/p/text()',
    )

    # Placeholders let the driver escape scraped text, so quotes in a title
    # or summary can neither break the statement nor inject SQL.
    sql = (
        'insert into xiaoshuo (names,ranks,authors,summarys,urls) '
        'values (%s,%s,%s,%s,%s)'
    )
    row = (names, ranks, authors, summarys, manh_url)

    db = getDB()
    try:
        with db.cursor() as cursor:
            try:
                cursor.execute(sql, row)
                db.commit()
            except Exception as exc:
                # Best effort: report the failure and keep the crawl going.
                print(exc)
                db.rollback()
    finally:
        # Always release the connection, even if rollback itself fails.
        db.close()
if __name__ == '__main__':
    # Fetch the list of book URLs, then scrape and store each one in turn.
    for book_url in get_url():
        get_url_info(book_url)