You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
2.1 KiB

import pymysql
import requests
from lxml import etree
def getDB(host='localhost', user='root', password='123456', database='douban'):
    """Open and return a new PyMySQL connection.

    Called with no arguments this connects exactly as before (local server,
    ``root`` user, ``douban`` database). The settings are exposed as
    parameters so callers can point the scraper at another server or account
    without editing this function.

    Args:
        host: MySQL server host name.
        user: MySQL account name.
        password: Password for ``user``.
        database: Schema to select after connecting.

    Returns:
        The connection object returned by ``pymysql.connect``. The caller is
        responsible for closing it.
    """
    # NOTE(review): the default credentials are hard-coded in source; consider
    # supplying them from configuration or the environment instead.
    return pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=database,
    )
def Agent_info():
    """Return a fresh dict of HTTP request headers carrying a browser User-Agent."""
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    )
    # A new dict is built on every call, so callers may mutate the result freely.
    return {'User-Agent': user_agent}
def get_url():
    """Fetch the comic ranking page and return absolute URLs for its entries.

    The ranking page is requested with the headers from ``Agent_info()``,
    every ``//div/ul/li/a/@href`` value is collected, and the links at
    positions 16..37 of that list are kept (presumably the ranked comics;
    the offsets depend on the page layout -- confirm if the site changes).

    Returns:
        A list of at most 22 absolute URLs. Fewer are returned when the page
        contains fewer matching links, instead of raising ``IndexError``.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    from urllib.parse import urljoin

    base_url = 'https://ac.qq.com/'
    rank_url = 'https://ac.qq.com/Rank/comicRank/type/top'

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=rank_url, headers=Agent_info(), timeout=10)
    tree = etree.HTML(response.text)
    links = tree.xpath('//div/ul/li/a/@href')

    # urljoin resolves root-relative hrefs ("/Comic/...") without producing a
    # doubled slash, and leaves already-absolute hrefs intact.
    return [urljoin(base_url, href) for href in links[16:38]]
def get_url_info(manh_url):
    """Scrape one comic detail page and insert a row into the ``manh`` table.

    The page is fetched with the headers from ``Agent_info()``. Rank, title
    and the author-line ``<em>`` texts are extracted via XPath and inserted
    together with ``manh_url`` into the columns
    ``names, ranks, renqi, shoucang, urls, authors`` (``renqi``/``shoucang``
    are presumably popularity and favourites figures -- confirm against the
    page).

    Args:
        manh_url: Absolute URL of the comic detail page.

    Raises:
        requests.RequestException: if the request fails or times out.
        IndexError: if the author line yields fewer than three
            comma-separated fields.

    Database errors are printed and the transaction is rolled back; they are
    not re-raised.
    """
    print("抓取网址", manh_url)
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.get(manh_url, headers=Agent_info(), timeout=10)
    tree = etree.HTML(response.text)

    info_root = '//*[@id="special_bg"]/div[3]/div/div/div[2]/div'
    ranks = tree.xpath(info_root + '/div[2]/p/strong/text()')
    ranks = str(ranks).strip('[').strip(']')
    names = tree.xpath(info_root + '/div[1]/h2/strong/text()')
    names = str(names).strip('[').strip(']')

    # One query is enough: author, renqi and shoucang all come from the same
    # node list.
    authors = tree.xpath(info_root + '/p/span/em/text()')

    # NOTE(review): the values below are sliced out of the list's repr, so
    # they keep the quoting (and, for renqi/shoucang, the leading space) that
    # repr produces. This is preserved on purpose so stored values do not
    # change; the [2:-5] slice assumes a fixed-width suffix on the first
    # field -- confirm before simplifying.
    author_fields = str(authors).split(',')
    authors0 = author_fields[0][2:-5]
    renqi = author_fields[1]
    shoucang = author_fields[2][:-1]
    print(authors)

    # Scraped text is untrusted: let the driver escape it instead of pasting
    # it into the statement, which broke on quotes and allowed SQL injection.
    sql = ('insert into manh (names,ranks,renqi,shoucang,urls,authors) '
           'values (%s,%s,%s,%s,%s,%s)')
    params = (names, ranks, renqi, shoucang, manh_url, authors0)

    db = getDB()
    try:
        with db.cursor() as cursor:
            try:
                cursor.execute(sql, params)
                db.commit()
            except Exception as exc:
                print(exc)
                db.rollback()
    finally:
        # Always release the connection, even if rollback itself fails.
        db.close()
if __name__ == '__main__':
    # Script entry point: collect the ranked comic URLs, then scrape and
    # store each detail page in turn.
    for comic_url in get_url():
        get_url_info(comic_url)