from bs4 import BeautifulSoup
import re
import urllib.error
import urllib.request
import xlwt
import sqlite3
# Regular expressions for extracting fields from the Douban Top 250 movie pages
findLink = re.compile(r'<a href="(.*?)">')
findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S)
findTitle = re.compile(r'<span class="title">(.*)</span>')
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
findJudge = re.compile(r'<span>(\d*)人评价</span>')
findBd = re.compile(r'<p class="">(.*?)</p>',re.S)

# Regular expressions for extracting fields from the now-playing page of a given city
findnowmovielink = re.compile(r'<a data-psource="poster" href="(.*?)" target="_blank">',re.S)
findnowmovieimg = re.compile(r'src="(.*?)"/>')
findnowmovietitle = re.compile(r'data-title="(.*?)"')
findnowmovieactors = re.compile(r'data-actors="(.*?)"')
findnowmovietime = re.compile(r'data-duration="(.*?)"')
findnowmovieregion = re.compile(r'data-region="(.*?)"')
findnowmoviedirector = re.compile(r'data-director="(.*?)"')
findnowmovieid = re.compile(r'id="(.*?)"')
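
# A minimal sketch of how these patterns are used (the snippets below are
# illustrative stand-ins, not real Douban markup):
#   >>> re.findall(findTitle, '<span class="title">肖申克的救赎</span>')
#   ['肖申克的救赎']
#   >>> re.findall(findnowmovietitle, '<li data-title="热门电影" class="list-item">')
#   ['热门电影']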

def main():
    a = input("Enter 1 (Top 250) or 2 (now playing): ")
    if a == "1":
        baseurl = "https://movie.douban.com/top250?start="  # target page URL prefix
        datalist = getData1(baseurl)  # a list holding one row per movie
        savepath = ".\\doubanmovietop250.xls"  # path of the spreadsheet to write
        saveData1(datalist, savepath)  # save to Excel
        dbpath = "doubanmovietop250.db"
        saveData2DB1(datalist, dbpath)  # save to SQLite
    elif a == "2":
        b = input("Enter a city: ")
        baseurl = "https://movie.douban.com/cinema/nowplaying/" + b + '/'
        c = getData2(baseurl)
        savepath = ".\\doubannowplaying.xls"
        saveData2(c, savepath)

# Fetch and parse all pages of the Top 250 list, one row per movie
def getData1(baseurl):
    datalist = []  # every scraped row is appended to this list
    for i in range(0, 10):  # the list spans 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = askURL1(url)
        soup = BeautifulSoup(html, "html.parser")  # parse the page
        for item in soup.find_all('div', class_="item"):
            # print(item)  # debugging checkpoint
            data = []  # holds the fields of a single movie
            item = str(item)
            link = re.findall(findLink, item)[0]  # detail-page link
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]  # poster image link
            data.append(imgSrc)
            titles = re.findall(findTitle, item)  # Chinese and foreign titles
            if len(titles) == 2:
                ctitle = titles[0]
                data.append(ctitle)
                otitle = titles[1].replace("/", "")
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(" ")
            rating = re.findall(findRating, item)[0]  # score
            data.append(rating)
            judgeNum = re.findall(findJudge, item)[0]  # number of ratings
            data.append(judgeNum)
            bd = re.findall(findBd, item)[0]  # summary line
            bd = re.sub(r'\s', "", bd)
            bd = re.sub(r'/', "", bd)
            bd = re.sub(r'<br>', "", bd)
            data.append(bd.strip())
            datalist.append(data)  # add this movie's row to the list
    return datalist
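
# Each row produced by getData1 is a list of seven strings, in the order
# [info_link, pic_link, cname, ename, score, rated, info], matching the
# columns written by saveData1 and saveData2DB1 below.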

# Fetch the HTML content of a given URL (Douban Top 250)
def askURL1(url):
    # pretend to be a browser
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
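
# Example call (a sketch; fetches the first page of the Top 250 list):
#   html = askURL1("https://movie.douban.com/top250?start=0")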

# Save the Top 250 rows to an Excel spreadsheet
def saveData1(datalist, savepath):
    print("save...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)
    # column headers: detail link, image link, Chinese title, foreign title,
    # score, number of ratings, summary
    col = ("电影详情链接", "图片链接", "影片中文名字", "影片外国名字", "评分", "评价数", "相关信息")
    for i in range(0, 7):
        sheet.write(0, i, col[i])  # header row
    for i in range(len(datalist)):
        print("%d" % (i + 1))
        data = datalist[i]
        for j in range(0, 7):
            sheet.write(i + 1, j, data[j])  # data rows start below the header
    book.save(savepath)

# Save the Top 250 rows into a SQLite database
def saveData2DB1(datalist, dbpath):
    init_db1(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    # a parameterized insert handles quotes inside the scraped text safely
    sql = '''
        insert into movie250(info_link,pic_link,cname,ename,score,rated,info)
        values(?,?,?,?,?,?,?)'''
    for data in datalist:
        cur.execute(sql, [d.strip() for d in data])
    conn.commit()
    cur.close()
    conn.close()
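
# A minimal sketch of reading the saved rows back out of the database
# (assumes saveData2DB1 above has already run against the same file):
#   conn = sqlite3.connect("doubanmovietop250.db")
#   for row in conn.execute("select cname, score from movie250 limit 5"):
#       print(row)
#   conn.close()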

# Create the movie250 table (skipped if it already exists, so reruns work)
def init_db1(dbpath):
    sql = '''
        create table if not exists movie250
        (
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rated numeric,
            info text
        )
    '''
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()

# Fetch the HTML content of a given URL (now-playing page)
def askURL2(url):
    # pretend to be a browser
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)  # debugging checkpoint
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

# Fetch and parse the now-playing list for one city, one row per movie
def getData2(baseurl):
    datalist = []  # every scraped row is appended to this list
    html = askURL2(baseurl)
    soup = BeautifulSoup(html, "html.parser")  # parse the page
    for item in soup.find_all('li', class_="list-item"):
        # print(item)  # debugging checkpoint
        data = []  # holds the fields of a single movie
        item = str(item)
        link = re.findall(findnowmovielink, item)[0]  # detail-page link
        data.append(link)
        img = re.findall(findnowmovieimg, item)[0]  # poster image link
        data.append(img)
        title = re.findall(findnowmovietitle, item)[0]
        data.append(title)
        director = re.findall(findnowmoviedirector, item)[0]
        data.append(director)
        actors = re.findall(findnowmovieactors, item)[0]
        data.append(actors)
        time = re.findall(findnowmovietime, item)[0]  # duration
        data.append(time)
        region = re.findall(findnowmovieregion, item)[0]  # region of production
        data.append(region)
        movie_id = re.findall(findnowmovieid, item)[0]  # renamed so it does not shadow the builtin id()
        data.append(movie_id)
        datalist.append(data)  # add this movie's row to the list
    # print(datalist)  # debugging checkpoint
    return datalist

# Save the now-playing rows to an Excel spreadsheet
def saveData2(datalist, savepath):
    print("save...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet("豆瓣正在上映", cell_overwrite_ok=True)
    # column headers: detail link, image link, Chinese title, director,
    # actors, duration, region, movie id
    col = ("电影详情链接", "图片链接", "影片中文名字", "导演", "演员", "时长", "制片地区", "电影id")
    for i in range(0, 8):
        sheet.write(0, i, col[i])  # header row
    for i in range(len(datalist)):
        print("%d" % (i + 1))
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])  # data rows start below the header
    book.save(savepath)

if __name__ == "__main__":
    main()
    print("爬取完成!")  # "Scraping complete!"