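"""Scrape the Douban Top 250 (https://movie.douban.com/top250) and store
each movie's rank, title, director, cast, language, rating and summary in
a local MySQL database, skipping movies that were already recorded.

Dependencies: pymysql, requests, beautifulsoup4, lxml.
"""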
import re

import pymysql
import requests
from bs4 import BeautifulSoup
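
# The script expects a MySQL database named `douban` with two tables. The
# DDL is not part of this file; the sketch below is inferred from the
# INSERT statements further down, and the column types are assumptions.
#
#   CREATE TABLE movies (
#       film_name  VARCHAR(255),
#       director   VARCHAR(255),
#       actor      TEXT,
#       language   VARCHAR(255),
#       ranks      INT,
#       rating_num DECIMAL(3, 1),
#       summary    TEXT,
#       links      VARCHAR(255)
#   );
#
#   CREATE TABLE moviehash (
#       movieid VARCHAR(32)  -- Douban subject id, used to skip duplicates
#   );
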
def getDB():
    """Open a connection to the local MySQL database."""
    db = pymysql.connect(host='localhost', user='root', password='123456', database='douban')
    return db


def Agent_info():
    """Build the request headers for movie.douban.com.

    A logged-in session Cookie (copied from a browser) is sent along with
    a desktop User-Agent so Douban does not block the scraper.
    """
    headers = {
        'Cookie': '118268"; bid=IO1Dg04kIm4; _vwo_uuid_v2=D1A645C6CFFBAF77D4DD98C62F188490B|754ea089c6c61f5af161e2912f2d4bee; __yadk_uid=NpVAgGKfMbTI28NFopWDALVWvzGMJt3S; _pk_id.100001.4cf6=095af3751c7a7a20.1681143032.; ap_v=0,6.0; __utmc=30149280; __utmc=223695111; dbcl2="279593631:HhdIjxDt0FA"; ck=XIW8; __utma=30149280.966668946.1681143033.1712632454.1712639313.6; __utmb=30149280.0.10.1712639313; __utmz=30149280.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1690211447.1681143033.1712632454.1712639313.6; __utmb=223695111.0.10.1712639313; __utmz=223695111.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1712639313%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; push_noty_num=0; push_doumail_num=0; __gads=ID=d8a2141b0213474a-229414c42add00a6:T=1681143032:RT=1712639314:S=ALNI_Mb89dGhTs42z60R9TMxDscyQIzA8A; __gpi=UID=00000bf05307ad13:T=1681143032:RT=1712639314:S=ALNI_MbkC2b_Z_7nO1PL2HHsgHolhWs0iw; __eoi=ID=2f9ca57c63f42bd7:T=1712496871:RT=1712639314:S=AA-AfjbV9P_SdwHly0Xzv8gyJ7ZR',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }
    return headers


def get_url(url):
    """Fetch one page of the Top 250 list and return the detail-page URLs on it."""
    print("Fetching list page:", url)
    headers = Agent_info()
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # Each movie on the list page sits in a container with class "item";
    # its first <a> links to the detail page.
    items = soup.find_all(attrs={'class': 'item'})
    film_urls = []
    for item in items:
        film_urls.append(item.a.get('href'))
    return film_urls


def get_url_info(film_url, movie_id):
    """Scrape one movie's detail page and insert the record into MySQL."""
    print("Fetching detail page:", film_url)
    headers = Agent_info()
    response = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # The rank badge reads like "No.3"; keep the part after the dot.
    ranks = soup.find(attrs={'class': 'top250-no'}).text.split('.')[1]
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text
    # The #info block is plain text with one field per line; the line
    # positions are fixed on Douban detail pages: 1 = director,
    # 3 = actors (separated by "/"), 6 = language.
    info = soup.find(attrs={'id': 'info'}).text
    director = info.split('\n')[1].split(':')[1].strip()
    actor = str(info.split('\n')[3].split(':')[1].strip().split('/'))
    language = info.split('\n')[6].split(':')[1].strip()
    rating_num = soup.find(attrs={'property': 'v:average'}).text
    summary = soup.find(attrs={'property': 'v:summary'}).text
    # Parameterised queries let the driver handle escaping, so quotes in
    # titles or summaries cannot break the statement.
    sql = ('insert into movies '
           '(film_name,director,actor,language,ranks,rating_num,summary,links) '
           'values (%s,%s,%s,%s,%s,%s,%s,%s)')
    db = getDB()
    cursor = db.cursor()
    try:
        cursor.execute(sql, (film_name, director, actor, language, ranks, rating_num, summary, film_url))
        # Record the Douban id so this movie is skipped on the next run.
        cursor.execute('insert into moviehash(movieid) values (%s)', (movie_id,))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    cursor.close()
    db.close()


if __name__ == '__main__':
    print("Starting crawl")
    db = getDB()
    cursor = db.cursor()
    # The list is paginated 25 movies per page; start=0 and start=25 cover
    # the first 50 entries (range(0, 250, 25) would cover all 250).
    for i in range(0, 50, 25):
        film_urls = get_url("https://movie.douban.com/top250?start=" + str(i) + "&filter=")
        for film_url in film_urls:
            # The numeric Douban subject id is embedded in the detail URL.
            movie_id = re.search(r'\d\d+', film_url).group()
            cursor.execute('select movieid from moviehash where movieid=%s', (movie_id,))
            data = cursor.fetchall()
            # Only scrape detail pages that are not stored yet.
            if not data:
                get_url_info(film_url, movie_id)
    cursor.close()
    db.close()