# Scrape book details from the qimao.com ranking page and store them in MySQL.
import re

import pymysql
import requests
from lxml import etree

def getDB():
    # Open a connection to the local MySQL database used for storage.
    db = pymysql.connect(host='localhost', user='root', password='123456', database='douban')
    return db

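# The INSERT below writes to a table named `xiaoshuo`, which this script does
# not create. A minimal sketch of a compatible schema, assuming generic
# VARCHAR/TEXT column types (only the column names come from the INSERT
# statement; the types and lengths are assumptions):
#
#   CREATE TABLE xiaoshuo (
#       names    VARCHAR(255),
#       ranks    VARCHAR(64),
#       authors  VARCHAR(255),
#       summarys TEXT,
#       urls     VARCHAR(255)
#   );
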
def Agent_info():
    # Desktop Chrome User-Agent so the requests are not rejected as a bot.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }
    return headers

def get_url():
    # Fetch the ranking page and collect the detail-page URL of every listed book.
    headers = Agent_info()
    manh_url = 'https://www.qimao.com/paihang'
    res = requests.get(url=manh_url, headers=headers)
    htmldata = res.text
    # Each book links to https://www.qimao.com/shuku/<id>/ ; grab the numeric ids.
    urls = re.findall(r'<a href="https://www.qimao.com/shuku/(\d+)/"', htmldata)
    # Rebuild the full URLs, keeping only the first occurrence of each.
    urls2 = []
    for url in urls:
        url = 'https://www.qimao.com/shuku/' + url + '/'
        if url not in urls2:
            urls2.append(url)
    return urls2

def get_url_info(manh_url):
    # Fetch one book's detail page, extract its fields and insert them into MySQL.
    print('Fetching page:', manh_url)
    headers = Agent_info()
    res = requests.get(manh_url, headers=headers)
    tree = etree.HTML(res.text)
    # Each XPath targets the rendered detail-page layout and returns a list of
    # text nodes, so take the first match (or an empty string if nothing matched).
    ranks = tree.xpath('//*[@id="__layout"]/div/div[3]/div/div/div/div[1]/div/div[1]/div[2]/div[1]/span[2]/text()')
    ranks = ranks[0].strip() if ranks else ''
    names = tree.xpath('//*[@id="__layout"]/div/div[3]/div/div/div/div[1]/div/div[1]/div[2]/div[1]/span[1]/text()')
    names = names[0].strip() if names else ''
    authors = tree.xpath('//*[@id="__layout"]/div/div[3]/div/div/div/div[1]/div/div[1]/div[2]/div[3]/span[1]/em/a/text()')
    authors = authors[0].strip() if authors else ''
    summarys = tree.xpath('//*[@id="__layout"]/div/div[3]/div/div/div/div[2]/div/div[2]/div[1]/div/div[1]/div[2]/p/text()')
    summarys = summarys[0].strip() if summarys else ''
    # Parameterised INSERT so quotes in the scraped text cannot break the SQL.
    sql = 'insert into xiaoshuo (names, ranks, authors, summarys, urls) values (%s, %s, %s, %s, %s)'
    db = getDB()
    cursor = db.cursor()
    try:
        cursor.execute(sql, (names, ranks, authors, summarys, manh_url))
        db.commit()
    except Exception as err:
        print(err)
        db.rollback()
    cursor.close()
    db.close()

if __name__ == '__main__':
    urls = get_url()
    for url in urls:
        get_url_info(url)