import re

import pymysql
import requests
from bs4 import BeautifulSoup  # the 'lxml' parser used below requires lxml to be installed

def getDB():
    '''Open a connection to the local MySQL database.'''
    db = pymysql.connect(host='localhost', user='root', password='shengji.', database='douban')
    return db
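
# Minimal sketch of the assumed schema, inferred from the INSERT statements in
# this script. Column names come from the code; the types and sizes are guesses
# and may differ from the real tables:
#
#   CREATE TABLE movies (
#       film_name VARCHAR(255), director TEXT, scriptwriter TEXT, actor TEXT,
#       filmtype VARCHAR(255), area VARCHAR(255), language VARCHAR(255),
#       initialrReleaseDate VARCHAR(64), ranks INT, runtime VARCHAR(64),
#       rating_num FLOAT, stars5_rating_per VARCHAR(16), rating_people INT,
#       summary TEXT, film_name_en VARCHAR(255), links VARCHAR(255)
#   );
#   CREATE TABLE moviehash (movieid VARCHAR(32));  -- log of already-scraped IDs
#   CREATE TABLE movietype (movieid VARCHAR(32), filmtype VARCHAR(64));
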
def Agent_info():
    '''Build the request headers: cookie, host and user-agent.'''
    headers = {
        # Session cookie copied from a browser; it expires, so refresh it if
        # requests start being rejected.
        'Cookie': 'douban-fav-remind=1; bid=LP2o54Mcp34; ll="118268"; ap_v=0,6.0',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62'
    }
    return headers

# Collect the detail-page URLs and the foreign-language titles from one page
# of the Top 250 list.
def get_url(url):
    print('Fetching:', url)
    headers = Agent_info()
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    pic = soup.find_all(attrs={'class': 'pic'})
    film_urls = []  # detail-page URLs
    for x in pic:
        href = x.a.get('href')
        film_urls.append(href)
    movie_list = []  # foreign-language titles
    div_list = soup.find_all('div', class_='hd')
    for each in div_list:
        # contents[3] is the <span> holding "&nbsp;/&nbsp;Foreign Title";
        # slicing off the first two characters drops the "/ " separator.
        movie = each.a.contents[3].text.strip()
        movie = movie[2:]
        movie_list.append(movie)
    return film_urls, movie_list
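
# Illustrative output for the first list page (actual values depend on the
# live site at scrape time):
#   film_urls[0]  -> 'https://movie.douban.com/subject/1292052/'
#   movie_list[0] -> 'The Shawshank Redemption'
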
# Scrape one movie's detail page and store the result in the database.
def get_url_info(film_url, film_name_en, id):
    print('Fetching:', film_url)
    headers = Agent_info()
    request = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    # Rank, e.g. "No.1" -> "1"
    ranks = soup.find(attrs={'class': "top250-no"}).text.split(".")[1]
    # Chinese title (first token of "中文名 Foreign Title")
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text.split(' ')[0]
    # The #info block is one "label: value" field per line; parse it positionally.
    info = soup.find(attrs={'id': "info"}).text.split('\n')
    # Director
    director = info[1].split(':')[1].strip()
    # Scriptwriter
    scriptwriter = info[2].split(':')[1].strip()
    # Lead actors
    actor = info[3].split(':')[1].strip()
    # Genres, slash-separated
    filmtype = info[4].split(':')[1].strip()
    types = filmtype.split("/")
    # Some pages carry an extra "official website" (官方网站) line that shifts
    # the remaining fields down by one.
    if info[5].split(':')[0] == '官方网站':
        # Country/region of production
        area = info[6].split(':')[1].strip()
        # Language
        language = info[7].split(':')[1].strip()
        # Release date (drop the parenthesised region suffix)
        initialrReleaseDate = info[8].split(':')[1].split('(')[0].strip()
    else:
        area = info[5].split(':')[1].strip()
        language = info[6].split(':')[1].strip()
        initialrReleaseDate = info[7].split(':')[1].split('(')[0].strip()
    # Runtime
    runtime = soup.find(attrs={'property': "v:runtime"}).text
    # Average rating
    rating_num = soup.find(attrs={'property': "v:average"}).text
    # Share of five-star ratings (first .rating_per element on the page)
    stars5_rating_per = soup.find(attrs={'class': "rating_per"}).text
    # Number of raters
    rating_people = soup.find(attrs={'property': "v:votes"}).text
    # Plot summary; no manual escaping needed, parameter binding below handles it
    summary = soup.find(attrs={'property': "v:summary"}).text
    # Store into the database. %s placeholders let pymysql escape every field,
    # so quotes in names or summaries cannot break the statement.
    sql = ('insert into movies(film_name,director,scriptwriter,actor,filmtype,area,'
           'language,initialrReleaseDate,ranks,runtime,rating_num,stars5_rating_per,'
           'rating_people,summary,film_name_en,links) '
           'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    db = getDB()
    try:
        cursor = db.cursor()
        cursor.execute(sql, (film_name, director, scriptwriter, actor, filmtype,
                             area, language, initialrReleaseDate, ranks, runtime,
                             rating_num, stars5_rating_per, rating_people, summary,
                             film_name_en, film_url))
        # Record the ID so this movie is skipped on later runs.
        cursor.execute('insert into moviehash(movieid) values(%s)', (id,))
        # One row per genre.
        for t in types:
            cursor.execute('insert into movietype(movieid,filmtype) values(%s,%s)',
                           (id, t.strip()))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    cursor.close()
    db.close()
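
# The per-genre loop above could also be a single batched call, which pymysql
# collapses into one multi-row INSERT:
#   cursor.executemany('insert into movietype(movieid,filmtype) values(%s,%s)',
#                      [(id, t.strip()) for t in types])
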
if __name__ == '__main__':
    print('Starting scrape')
    db = getDB()
    cursor = db.cursor()
    # The Top 250 list is paginated 25 movies at a time via ?start=0,25,...,225
    for i in range(0, 250, 25):
        film_urls, movie_list = get_url("https://movie.douban.com/top250?start=" + str(i) + "&filter=")
        for j in range(len(film_urls)):
            # The subject ID is the digit run in the detail URL.
            id = re.search(r'\d\d+', film_urls[j]).group()
            # Skip movies already recorded in moviehash on a previous run.
            cursor.execute('select movieid from moviehash where movieid = %s', (id,))
            data = cursor.fetchall()
            if not data:
                get_url_info(film_urls[j], movie_list[j], id)
    cursor.close()
    db.close()
    # get_url_info("https://movie.douban.com/subject/1291561/", "111", "1291561")