You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
61 lines
2.1 KiB
61 lines
2.1 KiB
import pymysql
|
|
import requests
|
|
from lxml import etree
|
|
|
|
|
|
def getDB():
    """Open and return a new pymysql connection.

    The connection targets the ``douban`` database on ``localhost`` using
    the credentials written below. Every call opens a fresh connection;
    the caller is responsible for closing it.
    """
    connection_settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'database': 'douban',
    }
    return pymysql.connect(**connection_settings)
|
|
|
|
def Agent_info():
    """Return the HTTP request headers used for every page fetch.

    The mapping holds a single ``User-Agent`` entry. A new dict is built on
    each call, so callers may mutate the result freely.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    )
    return {'User-Agent': user_agent}
|
|
|
|
def get_url():
    """Fetch the ac.qq.com comic ranking page and return comic page URLs.

    Downloads ``https://ac.qq.com/Rank/comicRank/type/top``, collects every
    ``//div/ul/li/a/@href`` value, keeps the entries at positions 16..37 and
    resolves each one against the site root.

    Returns:
        list[str]: up to 22 absolute URLs. Fewer (possibly none) are returned
        when the page yields fewer than 38 matching links; the original
        fixed-count loop raised IndexError in that case.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    site_root = "https://ac.qq.com/"
    manh_url = 'https://ac.qq.com/Rank/comicRank/type/top'

    # A timeout stops a stalled connection from hanging the script forever.
    res = requests.get(url=manh_url, headers=Agent_info(), timeout=10)
    page = etree.HTML(res.text)
    hrefs = page.xpath('//div/ul/li/a/@href')

    # NOTE(review): the 16:38 window presumably skips non-ranking links that
    # precede the ranking list in the markup -- confirm against the live page.
    # urljoin avoids the doubled slash that plain concatenation produces for
    # root-relative hrefs and leaves already-absolute hrefs intact.
    return [urljoin(site_root, href) for href in hrefs[16:38]]
|
|
|
|
def get_url_info(manh_url):
    """Scrape one comic page and insert its details into the ``manh`` table.

    Args:
        manh_url: URL of the comic page to fetch; it is also stored in the
            ``urls`` column.

    The page is parsed with XPath expressions rooted at the element whose id
    is ``special_bg``. One row is inserted with the columns
    ``names, ranks, renqi, shoucang, urls, authors``. A database error is
    printed and the transaction rolled back; it is not re-raised. A page
    that does not yield the three expected ``em`` values is reported and
    skipped instead of aborting the whole run.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    print("抓取网址", manh_url)

    # A timeout stops a stalled connection from hanging the script forever.
    res = requests.get(manh_url, headers=Agent_info(), timeout=10)
    page = etree.HTML(res.text)

    # NOTE(review): ranks and names are the str() of the XPath result list
    # with only the surrounding brackets stripped, so the stored values keep
    # the list repr's quoting. Kept as-is so new rows match existing ones.
    ranks = page.xpath('//*[@id="special_bg"]/div[3]/div/div/div[2]/div/div[2]/p/strong/text()')
    ranks = str(ranks).strip('[').strip(']')
    names = page.xpath('//*[@id="special_bg"]/div[3]/div/div/div[2]/div/div[1]/h2/strong/text()')
    names = str(names).strip('[').strip(']')

    # One query is enough: the original evaluated this same XPath three times.
    authors = page.xpath('//*[@id="special_bg"]/div[3]/div/div/div[2]/div/p/span/em/text()')
    print(authors)

    # The three stored fields are cut out of the list's repr, split on commas.
    fields = str(authors).split(',')
    if len(fields) < 3:
        # Unexpected page layout: skip this comic rather than crash the loop.
        print("解析失败，跳过", manh_url)
        return
    # Drops the leading "['" and the last five characters of the first item.
    # NOTE(review): presumably a trailing escaped non-breaking space plus the
    # closing quote -- confirm against a live page.
    authors0 = fields[0][2:-5]
    renqi = fields[1]
    shoucang = fields[2][:-1]  # drop the closing "]" of the list repr

    # Parameterized query: scraped text is untrusted and may contain quotes,
    # which broke (and could inject into) the previous string-formatted SQL.
    sql = ('insert into manh (names,ranks,renqi,shoucang,urls,authors) '
           'values (%s,%s,%s,%s,%s,%s)')
    params = (names, ranks, renqi, shoucang, manh_url, authors0)

    db = getDB()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql, params)
            db.commit()
        except Exception as err:
            print(err)
            db.rollback()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if cursor creation fails.
        db.close()
|
|
if __name__ == '__main__':
    # Script entry point: collect the ranking URLs, then scrape and store
    # each comic page in turn.
    for comic_url in get_url():
        get_url_info(comic_url)