import pymysql
import requests
from bs4 import BeautifulSoup
import re


def getDB():
    '''Connect to the MySQL database'''
    db = pymysql.connect(host='localhost', user='root', password='shengji.', database='douban')
    return db


def Agent_info():
    '''Request headers: cookie, host and user-agent'''
    headers = {
        'Cookie': 'douban-fav-remind=1; bid=LP2o54Mcp34; ll="118268"; ap_v=0,6.0',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62'
    }
    return headers


# Fetch the movie-detail URLs and the foreign-language titles from one list page
def get_url(url):
    print('Fetching:', url)
    headers = Agent_info()
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    pic = soup.find_all(attrs={'class': 'pic'})
    film_urls = []  # movie-detail URLs
    for x in pic:
        href = x.a.get('href')
        film_urls.append(href)
    movie_list = []  # foreign-language movie titles
    div_list = soup.find_all('div', class_='hd')
    for each in div_list:
        movie = each.a.contents[3].text.strip()
        movie = movie[2:]  # drop the leading "/ " separator
        movie_list.append(movie)
    return film_urls, movie_list


# Fetch one movie's details and store them in the database
def get_url_info(film_url, film_name_en, id):
    print('Fetching:', film_url)
    headers = Agent_info()
    request = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    # Rank, e.g. "No.1" -> "1"
    ranks = soup.find(attrs={'class': "top250-no"}).text.split(".")[1]
    # Chinese title (first token of the full title)
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text.split(' ')[0]
    # Director
    director = soup.find(attrs={'id': "info"}).text.split('\n')[1].split(':')[1].strip()
    # Scriptwriter
    scriptwriter = soup.find(attrs={'id': "info"}).text.split('\n')[2].split(':')[1].strip()
    # Leading actors
    actor = soup.find(attrs={'id': "info"}).text.split('\n')[3].split(':')[1].strip()
    # Genres
    filmtype = soup.find(attrs={'id': "info"}).text.split('\n')[4].split(':')[1].strip()
    types = filmtype.split("/")
    # Some pages carry an extra "official site" (官方网站) line, which shifts the fields below by one
    if soup.find(attrs={'id': "info"}).text.split('\n')[5].split(':')[0] == '官方网站':
        # Country/region of production
        area = soup.find(attrs={'id': "info"}).text.split('\n')[6].split(':')[1].strip()
        # Language
        language = soup.find(attrs={'id': "info"}).text.split('\n')[7].split(':')[1].strip()
        # Release date (date only, parenthesized region dropped)
        initialrReleaseDate = soup.find(attrs={'id': "info"}).text.split('\n')[8].split(':')[1].split('(')[0].strip()
    else:
        # Country/region of production
        area = soup.find(attrs={'id': "info"}).text.split('\n')[5].split(':')[1].strip()
        # Language
        language = soup.find(attrs={'id': "info"}).text.split('\n')[6].split(':')[1].strip()
        # Release date (date only, parenthesized region dropped)
        initialrReleaseDate = soup.find(attrs={'id': "info"}).text.split('\n')[7].split(':')[1].split('(')[0].strip()
    # Runtime
    runtime = soup.find(attrs={'property': "v:runtime"}).text
    # Average rating
    rating_num = soup.find(attrs={'property': "v:average"}).text
    # Share of five-star ratings
    stars5_rating_per = soup.find(attrs={'class': "rating_per"}).text
    # Number of raters
    rating_people = soup.find(attrs={'property': "v:votes"}).text
    # Plot summary, escaped so quotes in it do not break the SQL string below
    summary = soup.find(attrs={'property': "v:summary"}).text
    summary = pymysql.converters.escape_string(summary)
    # Save to the database
    sql = 'insert into movies(film_name,director,scriptwriter,actor,filmtype,area,language,initialrReleaseDate,ranks,runtime,rating_num,stars5_rating_per,rating_people,summary,film_name_en,links) values("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}");'.format(film_name, director, scriptwriter, actor, filmtype, area, language, initialrReleaseDate, ranks, runtime, rating_num, stars5_rating_per, rating_people, summary, film_name_en, film_url)
    db = getDB()
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        # Record the movie id so it is not crawled again
        cursor.execute('insert into moviehash(movieid) values("{}")'.format(id))
        # One row per genre in the movietype table
        for j in range(len(types)):
            cursor.execute('insert into movietype(movieid,filmtype) values("{}","{}")'.format(id, types[j].strip()))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    cursor.close()
    db.close()


if __name__ == '__main__':
    print('Start crawling')
    db = getDB()
    cursor = db.cursor()
    # Walk the 10 list pages of the Top 250, 25 movies per page
    for i in range(0, 250, 25):
        film_urls, movie_list = get_url("https://movie.douban.com/top250?start=" + str(i) + "&filter=")
        for film_url in range(len(film_urls)):
            # The movie id is the digit run in the detail URL
            id = re.search(r'\d\d+', film_urls[film_url]).group()
            # Skip movies already recorded in moviehash
            sql = 'select movieid from moviehash where movieid = {}'.format(id)
            cursor.execute(sql)
            data = cursor.fetchall()
            if not data:
                get_url_info(film_urls[film_url], movie_list[film_url], id)
# get_url_info("https://movie.douban.com/subject/1291561/","111","1291561")
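

# --- Setup sketch (not part of the original script) ---------------------------
# The INSERTs above assume three tables in the `douban` database: movies,
# moviehash and movietype. Their definitions are not included in this file, so
# the helper below is a minimal guess reconstructed from the column lists used
# in get_url_info(); every column name comes from those INSERT statements, but
# the column types and sizes are assumptions. If needed, call it once by hand
# before the first crawl, e.g. from a Python shell: create_tables()
def create_tables():
    db = getDB()
    cursor = db.cursor()
    cursor.execute('''
        create table if not exists movies(
            film_name varchar(255), director text, scriptwriter text,
            actor text, filmtype varchar(255), area varchar(255),
            language varchar(255), initialrReleaseDate varchar(64),
            ranks varchar(8), runtime varchar(64), rating_num varchar(16),
            stars5_rating_per varchar(16), rating_people varchar(32),
            summary text, film_name_en varchar(255), links varchar(255)
        )''')
    cursor.execute('create table if not exists moviehash(movieid varchar(32))')
    cursor.execute('create table if not exists movietype(movieid varchar(32), filmtype varchar(64))')
    db.commit()
    cursor.close()
    db.close()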