import re

import pymysql
import requests
from bs4 import BeautifulSoup  # pages are parsed with the 'lxml' parser, so lxml must be installed


def getDB():
    '''Open a connection to the local douban MySQL database.'''
    db = pymysql.connect(host='localhost', user='root', password='shengji.', database='douban')
    return db
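
# A minimal sketch of the tables this script writes to. The real schema is not
# part of this file, so the column types below are assumptions inferred from
# the INSERT statements further down:
#
#   CREATE TABLE movies (
#       film_name VARCHAR(255), director TEXT, scriptwriter TEXT, actor TEXT,
#       filmtype VARCHAR(255), area VARCHAR(255), language VARCHAR(255),
#       initialrReleaseDate VARCHAR(64), ranks INT, runtime VARCHAR(64),
#       rating_num FLOAT, stars5_rating_per VARCHAR(16), rating_people INT,
#       summary TEXT, film_name_en VARCHAR(255), links VARCHAR(255)
#   );
#   CREATE TABLE moviehash (movieid BIGINT PRIMARY KEY);           -- ids already scraped, used to resume
#   CREATE TABLE movietype (movieid BIGINT, filmtype VARCHAR(64)); -- one row per genre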

def Agent_info():
    '''Cookie, Host, and User-Agent headers used for every request.'''
    headers = {
        # NOTE: this Cookie is tied to one browsing session and may need refreshing.
        'Cookie': 'douban-fav-remind=1; bid=LP2o54Mcp34; ll="118268"; ap_v=0,6.0',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    }
    return headers

# Fetch the movie-detail URL list and the foreign (original) titles
# from one page of the Top 250 listing.
def get_url(url):
    print('Fetching:', url)
    headers = Agent_info()
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')

    # Each element with class "pic" wraps the poster link to the detail page.
    pic = soup.find_all(attrs={'class': 'pic'})
    film_urls = []  # movie-detail URLs
    for x in pic:
        film_urls.append(x.a.get('href'))

    # In each <div class="hd">, the second title span holds " / Foreign Title";
    # slicing off the first two characters drops the "/ " separator.
    movie_list = []  # foreign movie titles
    div_list = soup.find_all('div', class_='hd')
    for each in div_list:
        movie = each.a.contents[3].text.strip()
        movie = movie[2:]
        movie_list.append(movie)

    return film_urls, movie_list
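
# Illustrative example of what get_url() returns for the first listing page
# (values are examples only, not guaranteed to match the live site):
#   film_urls  -> ['https://movie.douban.com/subject/1292052/', ...]
#   movie_list -> ['The Shawshank Redemption', ...]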

# Scrape one movie's detail page and store the result in the database.
def get_url_info(film_url, film_name_en, id):
    print('Fetching:', film_url)
    headers = Agent_info()
    request = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')

    # Ranking, e.g. "No.1" -> "1"
    ranks = soup.find(attrs={'class': 'top250-no'}).text.split('.')[1]
    # Chinese title: the first space-separated word of "中文名 Original Title"
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text.split(' ')[0]

    # The #info block is one "label: value" line per field: director,
    # scriptwriter, cast, genre, (optional official site), country/region,
    # language, release date, ... Parse it once instead of re-fetching it
    # for every field.
    info = soup.find(attrs={'id': 'info'}).text.split('\n')
    director = info[1].split(':')[1].strip()
    scriptwriter = info[2].split(':')[1].strip()
    actor = info[3].split(':')[1].strip()
    filmtype = info[4].split(':')[1].strip()
    types = filmtype.split('/')

    # When an official-website line ('官方网站') is present, the remaining
    # fields are shifted down by one line.
    if info[5].split(':')[0] == '官方网站':
        area = info[6].split(':')[1].strip()
        language = info[7].split(':')[1].strip()
        initialrReleaseDate = info[8].split(':')[1].split('(')[0].strip()
    else:
        area = info[5].split(':')[1].strip()
        language = info[6].split(':')[1].strip()
        initialrReleaseDate = info[7].split(':')[1].split('(')[0].strip()

    # Runtime
    runtime = soup.find(attrs={'property': 'v:runtime'}).text
    # Average rating
    rating_num = soup.find(attrs={'property': 'v:average'}).text
    # Share of five-star ratings (first .rating_per element)
    stars5_rating_per = soup.find(attrs={'class': 'rating_per'}).text
    # Number of ratings
    rating_people = soup.find(attrs={'property': 'v:votes'}).text
    # Plot summary
    summary = soup.find(attrs={'property': 'v:summary'}).text

    # Save to the database. Parameterized queries let the driver handle
    # quoting, so manual escaping (pymysql.converters.escape_string) and
    # string formatting are unnecessary, and quotes in titles or summaries
    # can no longer break the statement.
    sql = ('insert into movies(film_name,director,scriptwriter,actor,filmtype,'
           'area,language,initialrReleaseDate,ranks,runtime,rating_num,'
           'stars5_rating_per,rating_people,summary,film_name_en,links) '
           'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    db = getDB()
    cursor = db.cursor()
    try:
        cursor.execute(sql, (film_name, director, scriptwriter, actor, filmtype,
                             area, language, initialrReleaseDate, ranks, runtime,
                             rating_num, stars5_rating_per, rating_people,
                             summary, film_name_en, film_url))
        # Record the id so a rerun can skip this movie.
        cursor.execute('insert into moviehash(movieid) values (%s)', (id,))
        # One movietype row per genre.
        for t in types:
            cursor.execute('insert into movietype(movieid,filmtype) values (%s,%s)',
                           (id, t.strip()))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        cursor.close()
        db.close()


if __name__ == '__main__':
    print('Starting scrape')
    db = getDB()
    cursor = db.cursor()
    # The Top 250 listing is paginated 25 movies per page.
    for i in range(0, 250, 25):
        film_urls, movie_list = get_url('https://movie.douban.com/top250?start=' + str(i) + '&filter=')
        for film_url, film_name_en in zip(film_urls, movie_list):
            # The numeric subject id in the detail URL identifies the movie.
            id = re.search(r'\d\d+', film_url).group()
            # Skip movies already stored by a previous run.
            cursor.execute('select movieid from moviehash where movieid = %s', (id,))
            data = cursor.fetchall()
            if not data:
                get_url_info(film_url, film_name_en, id)
    cursor.close()
    db.close()
    # get_url_info("https://movie.douban.com/subject/1291561/", "111", "1291561")