import random
import re

import jieba
import pymysql
from django.http import HttpResponse
from django.shortcuts import render
from django.views.decorators.clickjacking import xframe_options_sameorigin
from jinja2 import Environment, FileSystemLoader
from pyecharts import options as opts
from pyecharts.charts import Bar, Pie, WordCloud
from pyecharts.faker import Faker
from pyecharts.globals import CurrentConfig

# Point pyecharts at the project's template directory so render_embed() below
# can find the chart templates.
CurrentConfig.GLOBAL_ENV = Environment(loader=FileSystemLoader("./demo/templates"))


def home(request):
    return render(request, 'home.html')


@xframe_options_sameorigin
def page_views(request, page):
    # Resolve the route and return the matching page template.
    # current = request.path.split('/')[-1]
    # return render(request, current + '.html')
    return render(request, 'page' + str(page) + '.html')
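

# A minimal URLconf sketch; these routes are assumptions (in a real project the
# list belongs in urls.py), shown only to illustrate how the two views above
# could be wired up:
from django.urls import path

example_urlpatterns = [
    path('', home, name='home'),
    path('page/<int:page>/', page_views, name='page'),
]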


def table_one(request):
    x = []
    xaxis = []
    yaxis = []

    conn = pymysql.connect(host='120.79.165.140',
                           user='kino',
                           passwd="student",
                           db='python_web_spider_DB')
    cur = conn.cursor()

    sql1 = "SELECT distinct 规格 from wh_xinfadi"
    sql2 = "SELECT 规格,count(distinct 蔬菜种类) from wh_xinfadi group BY 规格"
    cur.execute(sql1)

    # Rows come back as tuples; keep the specifications that name a category ("...类").
    for sp in cur:
        this_kind = str(sp[0])
        if re.match(r'^([\u4e00-\u9fa5]+类)$', this_kind):
            x.append(this_kind)

    cur.execute(sql2)
    for it in cur:
        if it[0] in x:
            xaxis.append(it[0])
            yaxis.append(it[1])

    c = (
        Bar()
        .add_xaxis(xaxis)
        .add_yaxis('北京新发地菜市', yaxis)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="北京新发地菜市场",
                subtitle="产品种类分析"
            ),
            datazoom_opts=opts.DataZoomOpts(),
        )
    )
    # conn.commit()  # only needed for writes; this view is read-only
    cur.close()
    conn.close()
    return HttpResponse(c.render_embed('北京新发地菜市场产品种类分析.html'))
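

# All three chart views repeat the same connect/query/close sequence. A minimal
# helper sketch (same hard-coded credentials as above; not used by the views,
# just an illustration) that guarantees cleanup even when the query raises:
def fetch_rows(sql):
    conn = pymysql.connect(host='120.79.165.140', user='kino',
                           passwd="student", db='python_web_spider_DB')
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        conn.close()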


def pie_views(request):
    data = [
        ['1990年及以前', 0],
        ['1991到1995年', 0],
        ['1996到2000年', 0],
        ['2001到2005年', 0],
        ['2006到2010年', 0],
        ['2011到2015年', 0],
        ['2016年至今', 0],
    ]
    # Pick one distinct color per slice from a fixed palette.
    colors = random.sample(
        [
            "#c23531", "#2f4554", "#61a0a8", "#d48265", "#749f83", "#ca8622",
            "#bda29a", "#6e7074", "#546570", "#c4ccd3", "#f05b72", "#444693",
            "#726930", "#b2d235", "#6d8346", "#ac6767", "#1d953f", "#6950a1",
        ],
        len(data),
    )
    conn = pymysql.connect(host='120.79.165.140',
                           user='kino',
                           passwd="student",
                           db='python_web_spider_DB')
    cur = conn.cursor()

    sql1 = "SELECT 上映年份 from wh_doubanmovie"
    cur.execute(sql1)
    # Count each film into its five-year release bucket.
    for i in cur:
        year = int(i[0])
        if year <= 1990:
            data[0][1] += 1
        elif 1990 < year <= 1995:
            data[1][1] += 1
        elif 1995 < year <= 2000:
            data[2][1] += 1
        elif 2000 < year <= 2005:
            data[3][1] += 1
        elif 2005 < year <= 2010:
            data[4][1] += 1
        elif 2010 < year <= 2015:
            data[5][1] += 1
        else:
            data[6][1] += 1

    c = (
        Pie()
        .add("", data)
        .set_colors(colors)
        # .add("", [list(z) for z in zip(Faker.choose(), Faker.values())])
        # .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
        .set_global_opts(title_opts=opts.TitleOpts(title=""))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    )
    cur.close()
    conn.close()
    return HttpResponse(c.render_embed('豆瓣电影排行top250年份分布.html'))
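

# The if/elif ladder above can also be written as a table lookup; a sketch using
# bisect, with breakpoints mirroring the bucket labels in `data`:
import bisect

def year_bucket(year, breaks=(1990, 1995, 2000, 2005, 2010, 2015)):
    """Index (0..6) of the five-year bucket a release year falls into."""
    # usage: data[year_bucket(year)][1] += 1
    return bisect.bisect_left(breaks, year)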


def word_cloud(request):
    x = []
    data = []
    # Common Chinese stop words to keep out of the cloud.
    delete = ['的', '你', '我', '他', '说', '了', '话', '这', '是', '吗', '吧', '都',
              '也', '不', '为', '他们', '啊', '则', '和', '在']
    conn = pymysql.connect(host='120.79.165.140',
                           user='kino',
                           passwd="student",
                           db='python_web_spider_DB')
    cur = conn.cursor()

    sql1 = "SELECT comments from xjh_wangyiyun"
    cur.execute(sql1)
    txt = ''
    for i in cur:
        txt += str(i[0])

    # Segment the text and keep only pure-Chinese tokens.
    ls = jieba.lcut(txt)
    for item in ls:
        if re.match(r'^([\u4e00-\u9fa5]+)$', item):
            x.append(item)

    # Collapse duplicates into (word, weight) pairs; counts are scaled by 3 to
    # widen the word-size range. Duplicate tokens in the copy produce
    # zero-weight pairs, which are filtered out below.
    for item in x[::]:
        t = (item, x.count(item) * 3)
        data.append(t)
        while item in x:
            x.remove(item)

    # Filter into a new list rather than removing from the list being iterated.
    data = [item for item in data if item[1] != 0 and item[0] not in delete]
    data.sort(key=lambda t: t[1], reverse=True)  # heaviest words first

    c = (
        WordCloud()
        .add(series_name="歌曲:Mood 的评论高频词", data_pair=data, word_size_range=[10, 70])
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="网易云歌曲评论词云", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
            ),
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
    )

    cur.close()
    conn.close()
    return HttpResponse(c.render_embed('网易云歌曲评论词云.html'))
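

# The count-and-remove loop above is O(n^2); a sketch of building the same
# (word, weight) list with collections.Counter (same x3 scaling and stop-word
# filtering assumed):
def count_words(txt, stop_words):
    from collections import Counter
    words = [w for w in jieba.lcut(txt) if re.match(r'^[\u4e00-\u9fa5]+$', w)]
    counts = Counter(w for w in words if w not in stop_words)
    return sorted(((w, c * 3) for w, c in counts.items()),
                  key=lambda t: t[1], reverse=True)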


# Write your spider functions from here on, for example:
# def spider_fun(url, web_name):
#     pass


# ===== 桂林百姓网 (baixing.com) second-hand market spider =====
from bs4 import BeautifulSoup
import urllib.request, urllib.error  # urllib stands in for the requests library here
import os
import pandas as pd

# Regular expressions for pulling fields out of each listing.
# findLink = re.compile(r'href="(.*?)"')  # listing URL
findTitle = re.compile(r'target="_blank">(.*?)</a>')  # title
findPrice = re.compile(r'<span class="highlight">(.*?)</span>')  # price
findTag = re.compile(r'/" target="_blank">(.*?)</a></div>')  # item category
findPlace = re.compile(r'<div class="ad-item-detail">(.*?)</div>')  # location


def askURL(url):  # fetch a page and return its HTML
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        # print(html)  # test
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
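

# The import comment above notes that urllib stands in for requests here; for
# reference, a sketch of the equivalent fetch with requests (which is imported
# further down in this file):
def ask_url_with_requests(url):
    import requests
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    resp.raise_for_status()  # surface HTTP errors instead of returning ""
    return resp.text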


def getData(baseurl):  # extract the wanted fields from each listing page
    datalist = []
    for i in range(1, 4):  # crawl page by page; only the first few pages here
        url = baseurl + str(i)
        html = askURL(url)

        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="media-body"):  # extract fields
            # print(item)  # test
            data = []

            item = str(item)
            title = re.findall(findTitle, item)[0]
            place = re.findall(findPlace, item)[0]
            price = re.findall(findPrice, item)[0]
            tag = re.findall(findTag, item)[0]

            data.append(title)
            data.append(place)
            data.append(price)
            data.append(tag)
            datalist.append(data)

    return datalist


def saveData(savepath, datalist, web_name):  # save to disk
    name = ["标题", "地址", "价格", "类型"]
    file = pd.DataFrame(columns=name, data=datalist)  # combine header and rows
    file.to_csv(savepath + '/lyh_tiaozaomarket.csv')  # save as a CSV under savepath
    print('已保存%s信息' % web_name)


'''--------- entry point for this spider ---------'''


def begin_spider(url, web_name):
    # The arguments are overwritten: this spider always crawls the Guilin
    # baixing second-hand market.
    url = 'https://guilin.baixing.com/ershou/?page='
    web_name = '桂林百姓网二手市场'
    savepath = os.getcwd()  # save into the current working directory
    datalist = getData(url)
    saveData(savepath, datalist, web_name)
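

# NOTE: each section below defines its own begin_spider()/main(); in a single
# module every later definition shadows the earlier one, so after an import
# only the bottom-most definitions survive. When this file runs as a script,
# each "if __name__" block still executes with the definitions that exist at
# that point in the file.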


# ===== 猪八戒网 (zbj.com) SaaS listings spider =====
import csv
from lxml import etree
import requests


def begin_spider(url, web_name):
    # The url argument is overwritten: this spider always crawls the SaaS search page.
    url = "https://guilin.zbj.com/search/f/?type=new&kw=saas"
    # Headers to pass the UA check; the User-Agent comes from the browser's F12 tools.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51"
    }
    response = requests.get(url=url, headers=headers)
    html = etree.HTML(response.text)  # parse the page with lxml's etree
    divs = html.xpath("/html/body/div[6]/div/div/div[2]/div[6]/div[1]/div")  # absolute XPath taken from the browser's F12 tools
    f = open("data.csv", 'w', encoding='utf-8', newline='')  # newline='' keeps the csv module from writing blank rows
    csv_writer = csv.writer(f)
    for div in divs:

        price_temp = div.xpath("./div/div/a[1]/div[2]/div[1]/span[1]/text()")  # price
        title_temp = div.xpath("./div/div/a[1]/div[2]/div[2]/p/text()")  # project title
        Company_temp = div.xpath("./div/div/a[2]/div[1]/p/text()")  # company name
        Address_temp = div.xpath("./div/div/a[2]/div[1]/div/span/text()")  # company location
        # Clean up the extracted data; skip listings missing any field.
        if len(price_temp) != 0 and len(title_temp) != 0 and len(Company_temp) != 0 and len(Address_temp) != 0:
            price = price_temp[0].strip("¥")  # drop the currency sign
            title = "SAAS".join(title_temp)  # the highlighted keyword splits the title; rejoin the fragments
            Company = Company_temp[0]  # first match
            Address = Address_temp[0]  # first match
            csv_writer.writerow([price, title, Company, Address])  # write one row
    f.close()  # flush and close the CSV
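

# The four len() checks above guard against listings with a missing field; a
# small helper sketch that collapses that pattern:
def first_or_default(nodes, default=''):
    """Return the first XPath text match, or `default` when the node is absent."""
    return nodes[0] if nodes else default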


def main():
    begin_spider("https://guilin.zbj.com/search/f/?type=new&kw=saas", "猪八戒")


if __name__ == '__main__':
    main()


# ===== 网易云音乐 (music.163.com) comment spider =====
import json
import time

# Headers to pass the UA check; Host is the crawled domain, and the User-Agent
# comes from the browser's F12 tools.
headers = {
    'Host': 'music.163.com',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51"
}


def begin_spider(page, url, web_name):
    """
    Fetch one page of comments and append them to test.csv.
    """
    url = 'https://music.163.com/api/v1/resource/comments/R_SO_4_483671599?limit=10&offset=' + str(page)  # NetEase Cloud Music comment API
    response = requests.get(url=url, headers=headers)  # send the request
    print(response.status_code)
    # Parse the response body as JSON.
    result = json.loads(response.text)
    items = result['comments']  # the comment list sits under the "comments" key; debug here if this is unclear
    for item in items:  # pull the fields out of each comment
        # username (fullwidth commas keep the hand-built CSV rows intact)
        user_name = item['user']['nickname'].replace(',', ',')
        # comment body (stripping newlines here is an assumed intent)
        comment = item['content'].strip().replace('\n', '').replace(',', ',')
        # like count
        praise = str(item['likedCount'])
        # comment timestamp: milliseconds since the epoch, so keep the first ten digits
        date = time.localtime(int(str(item['time'])[:10]))
        date = time.strftime("%Y-%m-%d %H:%M:%S", date)

        with open('test.csv', 'a', encoding='utf-8-sig') as f:  # append to the CSV
            f.write(user_name + ',' + comment + ',' + praise + ',' + date + '\n')
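

# Hand-concatenated CSV rows break if a field ever slips through with a comma
# or newline; a sketch of the same append using the csv module instead:
def write_comment_row(row, path='test.csv'):
    with open(path, 'a', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerow(row)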


def main():
    # limit=10 in the API URL above, so advance the offset by 10 per request.
    for i in range(0, 100, 10):
        begin_spider(i, "https://music.163.com/", "网易云")
        time.sleep(1)  # be polite to the server


if __name__ == '__main__':
    main()


# ===== 豆瓣 (douban.com) Top 250 movie spider =====
import xlwt  # for writing Excel files


def main():
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages
    name = '豆瓣top250'
    datalist = begin_spider(baseurl, name)
    savepath = "豆瓣电影top25.xls"
    # 3. Save the data
    saveData(datalist, savepath)


# Link to the film's detail page
findLink = re.compile(r'<a href="(.*?)">')  # compiled regular expression (the string pattern is the rule)
# Film poster image
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S lets "." match newlines too
# Film title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of raters
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*?)</span>')
# Related details block
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)


# Crawl the pages
def begin_spider(baseurl, web_name):
    datalist = []
    for i in range(0, 1):  # fetch the listing page(s); just one here
        url = baseurl + str(i * 25)
        html = askURL(url)  # download the page source

        # 2. Parse the results one by one
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):  # each matching div is one film
            # print(item)  # inspect one film's raw item block
            data = []  # all fields for one film
            item = str(item)

            link = re.findall(findLink, item)[0]  # match the compiled patterns against the item HTML
            data.append(link)  # detail-page link

            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)  # poster link

            titles = re.findall(findTitle, item)

            if len(titles) == 2:
                ctitle = titles[0]
                data.append(ctitle)  # Chinese title
                otitle = titles[1].replace("/", "")  # strip the separator
                data.append(otitle)  # foreign title
            else:
                data.append(titles[0])
                data.append(' ')  # leave the foreign title blank

            rating = re.findall(findRating, item)[0]
            data.append(rating)  # rating

            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)  # number of raters

            inq = re.findall(findInq, item)
            if len(inq) != 0:
                data.append(inq[0].replace("。", " "))  # summary, full stop dropped
            else:
                data.append(" ")  # leave blank when the film has no summary

            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # drop <br/> tags
            bd = re.sub('/', " ", bd)  # replace the separators
            data.append(bd.strip())  # trim surrounding whitespace
            print(data)
            datalist.append(data)  # one finished film record

    return datalist


# Fetch the page content for a given URL
def askURL(url):
    head = {  # browser headers sent along with the request
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    }
    # The user agent tells the Douban server what kind of client is asking
    # (effectively, what level of content we can accept).
    request = urllib.request.Request(url, headers=head)
    html = ""
    # Error handling
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")

    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html


# Save the data
def saveData(datalist, savepath):
    print("save...")
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet('豆瓣电影top25')
    col = ("电影链接", "图片链接", "影片中文名", "影片外文名", "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):
        sheet.write(0, i, col[i])  # header row
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
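

# xlwt writes the legacy .xls format; a sketch of the same save with openpyxl
# (assumed installed), producing a modern .xlsx instead:
def save_data_xlsx(datalist, savepath):
    from openpyxl import Workbook
    book = Workbook()
    sheet = book.active
    sheet.append(["电影链接", "图片链接", "影片中文名", "影片外文名", "评分", "评价数", "概况", "相关信息"])
    for row in datalist:
        sheet.append(row)  # one film per row, columns in header order
    book.save(savepath)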


if __name__ == "__main__":
    main()
    print("爬取完毕!")