import pymysql as pymysql
import requests
from bs4 import BeautifulSoup
import re
from lxml import etree
def getDB():
    """Open and return a connection to the local `douban` MySQL database."""
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='shengji.',
        database='douban',
    )
    return connection
def Agent_info():
    """Build the HTTP headers (cookies, host, user-agent) used for douban requests."""
    return {
        'Cookie': 'douban-fav-remind=1; bid=LP2o54Mcp34; ll="118268"; ap_v=0,6.0',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    }
def get_url(url):
    """Fetch one top250 listing page; return (detail-page URLs, foreign titles)."""
    print('抓取网址:', url)
    response = requests.get(url, headers=Agent_info())
    soup = BeautifulSoup(response.text, 'lxml')

    # Detail-page links: the <a> inside each element carrying class "pic".
    film_urls = [item.a.get('href') for item in soup.find_all(attrs={'class': 'pic'})]

    # Foreign title: 4th child of the <a> in each "hd" div; after stripping,
    # the first two characters (the "/ " separator) are dropped.
    movie_list = [
        div.a.contents[3].text.strip()[2:]
        for div in soup.find_all('div', class_='hd')
    ]

    return film_urls, movie_list
def get_url_info(film_url, film_name_en, id):
    """Scrape one movie detail page and persist it into the database.

    Parameters:
        film_url: detail-page URL, e.g. https://movie.douban.com/subject/1291561/
        film_name_en: foreign (non-Chinese) title scraped from the listing page
        id: douban subject id extracted from the URL (stored as movieid)

    Inserts one row into `movies`, marks the id in `moviehash`, and one row per
    genre into `movietype`; rolls back on any error.
    """
    print('抓取网址:', film_url)
    headers = Agent_info()
    request = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')

    # Rank, e.g. "No.1" -> "1".
    ranks = soup.find(attrs={'class': "top250-no"}).text.split(".")[1]
    # Chinese title: first whitespace-separated token of the full title.
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text.split(' ')[0]

    # The "info" block is one text blob with one "label: value" pair per line.
    # Parse it once instead of re-querying the soup for every field.
    info_lines = soup.find(attrs={'id': "info"}).text.split('\n')

    director = info_lines[1].split(':')[1].strip()
    scriptwriter = info_lines[2].split(':')[1].strip()
    actor = info_lines[3].split(':')[1].strip()
    filmtype = info_lines[4].split(':')[1].strip()
    types = filmtype.split("/")

    # Some pages carry an extra "官方网站" (official site) line, which shifts
    # the remaining fields down by one.
    offset = 1 if info_lines[5].split(':')[0] == '官方网站' else 0
    area = info_lines[5 + offset].split(':')[1].strip()
    language = info_lines[6 + offset].split(':')[1].strip()
    initialrReleaseDate = info_lines[7 + offset].split(':')[1].split('(')[0].strip()

    runtime = soup.find(attrs={'property': "v:runtime"}).text
    rating_num = soup.find(attrs={'property': "v:average"}).text
    stars5_rating_per = soup.find(attrs={'class': "rating_per"}).text
    rating_people = soup.find(attrs={'property': "v:votes"}).text
    summary = soup.find(attrs={'property': "v:summary"}).text

    # Parameterized SQL instead of string-formatting values into quoted
    # literals: closes the injection hole and makes the manual
    # escape_string call (which double-escaped quotes in the stored
    # summary) unnecessary.
    sql = (
        'insert into movies(film_name,director,scriptwriter,actor,filmtype,area,'
        'language,initialrReleaseDate,ranks,runtime,rating_num,stars5_rating_per,'
        'rating_people,summary,film_name_en,links) '
        'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    )
    params = (film_name, director, scriptwriter, actor, filmtype, area, language,
              initialrReleaseDate, ranks, runtime, rating_num, stars5_rating_per,
              rating_people, summary, film_name_en, film_url)

    db = getDB()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql, params)
            cursor.execute('insert into moviehash(movieid) values(%s)', (id,))
            for one_type in types:
                cursor.execute(
                    'insert into movietype(movieid,filmtype) values(%s,%s)',
                    (id, one_type.strip()),
                )
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when cursor creation fails
        # (the original leaked it on that path).
        db.close()
if __name__ == '__main__':
    print('开始抓取')
    db = getDB()
    try:
        cursor = db.cursor()
        try:
            # Top250 is paginated 25 entries per page.
            for start in range(0, 250, 25):
                film_urls, movie_list = get_url(
                    "https://movie.douban.com/top250?start=" + str(start) + "&filter=")
                for film_url, movie_name in zip(film_urls, movie_list):
                    # Subject id is the run of digits in the detail URL
                    # (raw string: '\d' in a plain literal is deprecated).
                    movie_id = re.search(r'\d\d+', film_url).group()
                    # Parameterized lookup; skip movies already scraped.
                    cursor.execute(
                        'select movieid from moviehash where movieid = %s',
                        (movie_id,),
                    )
                    if not cursor.fetchall():
                        get_url_info(film_url, movie_name, movie_id)
        finally:
            cursor.close()
    finally:
        # The original never closed the connection or cursor.
        db.close()