import re

import pymysql
import requests
from bs4 import BeautifulSoup


def getDB():
    # Open a connection to the local MySQL database that stores the scraped data.
    db = pymysql.connect(host='localhost', user='root', password='123456', database='douban')
    return db
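
# The script assumes the `douban` database already contains `movies` and
# `moviehash` tables. A minimal sketch of the expected schema, with column
# names taken from the INSERT statements below and column types assumed:
#   CREATE TABLE movies (
#       film_name VARCHAR(255), director VARCHAR(255), actor TEXT,
#       language VARCHAR(255), ranks INT, rating_num VARCHAR(8),
#       summary TEXT, links VARCHAR(255)
#   );
#   CREATE TABLE moviehash (movieid VARCHAR(32));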


def Agent_info():
    # Request headers for movie.douban.com; the Cookie comes from a logged-in
    # Douban session and should be replaced with your own before running.
    headers = {
        'Cookie': '118268"; bid=IO1Dg04kIm4; _vwo_uuid_v2=D1A645C6CFFBAF77D4DD98C62F188490B|754ea089c6c61f5af161e2912f2d4bee; __yadk_uid=NpVAgGKfMbTI28NFopWDALVWvzGMJt3S; _pk_id.100001.4cf6=095af3751c7a7a20.1681143032.; ap_v=0,6.0; __utmc=30149280; __utmc=223695111; dbcl2="279593631:HhdIjxDt0FA"; ck=XIW8; __utma=30149280.966668946.1681143033.1712632454.1712639313.6; __utmb=30149280.0.10.1712639313; __utmz=30149280.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1690211447.1681143033.1712632454.1712639313.6; __utmb=223695111.0.10.1712639313; __utmz=223695111.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1712639313%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; push_noty_num=0; push_doumail_num=0; __gads=ID=d8a2141b0213474a-229414c42add00a6:T=1681143032:RT=1712639314:S=ALNI_Mb89dGhTs42z60R9TMxDscyQIzA8A; __gpi=UID=00000bf05307ad13:T=1681143032:RT=1712639314:S=ALNI_MbkC2b_Z_7nO1PL2HHsgHolhWs0iw; __eoi=ID=2f9ca57c63f42bd7:T=1712496871:RT=1712639314:S=AA-AfjbV9P_SdwHly0Xzv8gyJ7ZR',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }
    return headers


def get_url(url):
    # Fetch one Top 250 list page and collect the detail-page URL of every film on it.
    print("Fetching URL", url)
    headers = Agent_info()
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    pic = soup.find_all(attrs={'class': 'item'})
    film_urls = []
    for x in pic:
        href = x.a.get('href')
        film_urls.append(href)
    return film_urls


def get_url_info(film_url, id):
    # Fetch a film's detail page, extract its fields, and insert a row into MySQL.
    print("Fetching URL", film_url)
    headers = Agent_info()
    request = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    ranks = soup.find(attrs={'class': 'top250-no'}).text.split('.')[1]  # "No.X" -> "X"
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text
    # The "info" block is plain text with one "Label: value" pair per line.
    info = soup.find(attrs={'id': 'info'}).text.split('\n')
    director = info[1].split(':')[1].strip()
    actor = str(info[3].split(':')[1].strip().split('/'))
    language = info[6].split(':')[1].strip()
    rating_num = soup.find(attrs={'property': 'v:average'}).text
    summary = soup.find(attrs={'property': 'v:summary'}).text
    # Parameterized queries let the driver quote and escape the values, so the
    # manual pymysql.converters.escape_string() calls are no longer needed.
    sql = ('insert into movies (film_name,director,actor,language,ranks,rating_num,summary,links) '
           'values (%s,%s,%s,%s,%s,%s,%s,%s)')
    db = getDB()
    cursor = db.cursor()
    try:
        cursor.execute(sql, (film_name, director, actor, language, ranks, rating_num, summary, film_url))
        cursor.execute('insert into moviehash(movieid) values (%s)', (id,))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    cursor.close()
    db.close()


if __name__ == '__main__':
    print("Starting scrape")
    db = getDB()
    cursor = db.cursor()
    # Each list page holds 25 films; range(0, 50, 25) covers the first two pages.
    # Use range(0, 250, 25) to walk the whole Top 250.
    for i in range(0, 50, 25):
        film_urls = get_url("https://movie.douban.com/top250?start=" + str(i) + "&filter=")
        for film_url in film_urls:
            # The numeric Douban id is embedded in the detail-page URL.
            id = re.search(r'\d\d+', film_url).group()
            # Skip films already recorded in moviehash.
            cursor.execute('select movieid from moviehash where movieid=%s', (id,))
            data = cursor.fetchall()
            if not data:
                get_url_info(film_url, id)
    cursor.close()
    db.close()