Python/getData.py

import pymysql
import requests
from bs4 import BeautifulSoup
import re
# lxml itself is never imported directly, but it must be installed:
# BeautifulSoup is created with the 'lxml' parser throughout this script.


def getDB():
    '''Open a connection to the local MySQL "douban" database.'''
    db = pymysql.connect(host='localhost', user='root', password='shengji.', database='douban')
    return db
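
# A minimal sketch of the schema this script assumes. Column names are taken
# from the INSERT statements below; the column types are assumptions:
#
#   CREATE TABLE movies (
#       film_name VARCHAR(255), director VARCHAR(255), scriptwriter TEXT,
#       actor TEXT, filmtype VARCHAR(255), area VARCHAR(255),
#       language VARCHAR(255), initialrReleaseDate VARCHAR(255),
#       ranks VARCHAR(16), runtime VARCHAR(64), rating_num VARCHAR(16),
#       stars5_rating_per VARCHAR(16), rating_people VARCHAR(16),
#       summary TEXT, film_name_en VARCHAR(255), links VARCHAR(255)
#   );
#   CREATE TABLE moviehash (movieid VARCHAR(32));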


def Agent_info():
    '''Request headers (Cookie, Host, User-Agent) sent with every request.'''
    headers = {
        'Cookie': 'douban-fav-remind=1; bid=LP2o54Mcp34; ll="118268"; ap_v=0,6.0',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62'
    }
    return headers
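# Note: the Cookie above was captured from one browser session and may expire;
# the browser-like User-Agent is the important part, since Douban tends to
# reject requests sent with the default python-requests User-Agent.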


# Collect the detail-page URLs and the foreign-language titles from one
# Top-250 list page.
def get_url(url):
    print('Fetching:', url)
    headers = Agent_info()
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    pic = soup.find_all(attrs={'class': 'pic'})
    film_urls = []  # detail-page URLs
    for x in pic:
        href = x.a.get('href')
        film_urls.append(href)
    movie_list = []  # foreign-language titles
    div_list = soup.find_all('div', class_='hd')
    for each in div_list:
        # contents[3] is the second <span class="title">, whose text looks
        # like '\xa0/\xa0Original Title'; strip() plus [2:] drops the '/ ' part.
        movie = each.a.contents[3].text.strip()
        movie = movie[2:]
        movie_list.append(movie)
    return film_urls, movie_list
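# For illustration, one list page yields entries shaped like (actual values
# depend on the live page):
#   film_urls[0]  -> 'https://movie.douban.com/subject/1292052/'
#   movie_list[0] -> 'The Shawshank Redemption'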


# Scrape one detail page and store the movie in the database.
def get_url_info(film_url, film_name_en, id):
    print('Fetching:', film_url)
    headers = Agent_info()
    request = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    # Rank: the badge text is 'No.<rank>'
    ranks = soup.find(attrs={'class': "top250-no"}).text.split(".")[1]
    # Chinese title (first token of the full page title)
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text.split(' ')[0]
    # The #info block is one 'label: value' line per field, in page order:
    # director, scriptwriter, actors, genre, country/region, language, ...
    info = soup.find(attrs={'id': "info"}).text.split('\n')
    # Director
    director = info[1].split(':')[1].strip()
    # Scriptwriter
    scriptwriter = info[2].split(':')[1].strip()
    # Leading actors
    actor = info[3].split(':')[1].strip()
    # Genre
    filmtype = info[4].split(':')[1].strip()
    # Country/region and language: these definitions were missing from the
    # original file, so the indices below assume the #info layout named above.
    area = info[5].split(':')[1].strip()
    language = info[6].split(':')[1].strip()
    # Release date (first v:initialReleaseDate span; also an assumed lookup)
    initialrReleaseDate = soup.find(attrs={'property': "v:initialReleaseDate"}).text
    # Runtime
    runtime = soup.find(attrs={'property': "v:runtime"}).text
    # Average rating
    rating_num = soup.find(attrs={'property': "v:average"}).text
    # Share of five-star ratings
    stars5_rating_per = soup.find(attrs={'class': "rating_per"}).text
    # Number of raters
    rating_people = soup.find(attrs={'property': "v:votes"}).text
    # Plot summary; escaped because it is interpolated into the SQL string below
    summary = soup.find(attrs={'property': "v:summary"}).text
    summary = pymysql.converters.escape_string(summary)
    sql = ('insert into movies(film_name,director,scriptwriter,actor,filmtype,'
           'area,language,initialrReleaseDate,ranks,runtime,rating_num,'
           'stars5_rating_per,rating_people,summary,film_name_en,links) '
           'values("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}",'
           '"{}","{}","{}","{}","{}");').format(
        film_name, director, scriptwriter, actor, filmtype, area, language,
        initialrReleaseDate, ranks, runtime, rating_num, stars5_rating_per,
        rating_people, summary, film_name_en, film_url)
    db = getDB()
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        # Remember this movie id so later runs can skip it.
        cursor.execute('insert into moviehash(movieid) values("{}")'.format(id))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    cursor.close()
    db.close()
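
# A safer variant (a sketch, not the original approach): pass the values
# separately and let pymysql quote them, which also makes escape_string()
# unnecessary, e.g.:
#   cursor.execute('insert into moviehash(movieid) values(%s)', (id,))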


if __name__ == '__main__':
    print('Start scraping')
    db = getDB()
    cursor = db.cursor()
    # First two list pages (50 of the 250 titles); use range(0, 250, 25) for all.
    for i in range(0, 50, 25):
        film_urls, movie_list = get_url("https://movie.douban.com/top250?start=" + str(i) + "&filter=")
        for film_url, film_name_en in zip(film_urls, movie_list):
            # The numeric subject id in the detail URL identifies the movie.
            id = re.search(r'\d\d+', film_url).group()
            # Skip movies already stored by a previous run.
            sql = 'select movieid from moviehash where movieid = {}'.format(id)
            cursor.execute(sql)
            data = cursor.fetchall()
            if not data:
                get_url_info(film_url, film_name_en, id)
    cursor.close()
    db.close()