import random
import re

import jieba
import pymysql
from django.http import HttpResponse
from django.shortcuts import render
from django.views.decorators.clickjacking import xframe_options_sameorigin
from jinja2 import Environment, FileSystemLoader
from pyecharts.faker import Faker
from pyecharts.globals import CurrentConfig
from pyecharts import options as opts
from pyecharts.charts import Bar, Pie
from pyecharts.charts import WordCloud

CurrentConfig.GLOBAL_ENV = Environment(loader=FileSystemLoader("./demo/templates"))


def home(request):
    return render(request, 'home.html')


@xframe_options_sameorigin
def page_views(request, page):
    # Resolve the route and render the matching page template.
    # current = request.path.split('/')[-1]
    # return render(request, current + '.html')
    return render(request, 'page' + str(page) + '.html')


def table_one(request):
    x = []
    xaxis = []
    yaxis = []
    conn = pymysql.connect(host='120.79.165.140', user='kino', passwd="student",
                           db='python_web_spider_DB')
    cur = conn.cursor()
    sql1 = "SELECT distinct 规格 from wh_xinfadi"
    sql2 = "SELECT 规格,count(distinct 蔬菜种类) from wh_xinfadi group BY 规格"
    cur.execute(sql1)
    for sp in cur:
        init = str(sp)
        if '\'' in init:
            this_kind = init.split('\'')[1]
        else:
            this_kind = init
        # Keep only specifications that are pure Chinese and end with "类" (category).
        if re.match(r'^([\u4e00-\u9fa5]+类)$', this_kind):
            x.append(this_kind)
    cur.execute(sql2)
    for it in cur:
        if it[0] in x:
            xaxis.append(it[0])
            yaxis.append(it[1])
    c = (
        Bar()
        .add_xaxis(xaxis)
        .add_yaxis('北京新发地菜市', yaxis)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="北京新发地菜市场", subtitle="产品种类分析"
            ),
            datazoom_opts=opts.DataZoomOpts(),
        )
    )
    # conn.commit()  # remember to commit when writing
    cur.close()
    conn.close()
    return HttpResponse(c.render_embed('北京新发地菜市场产品种类分析.html'))


def pie_views(request):
    data = [
        ['1990年及以前', 0],
        ['1991到1995年', 0],
        ['1996到2000年', 0],
        ['2001到2005年', 0],
        ['2006到2010年', 0],
        ['2011到2015年', 0],
        ['2016年至今', 0],
    ]
    # Draw a distinct random colour for each pie slice.
    colors = []
    while True:
        color = random.choice(
            [
                "#c23531", "#2f4554", "#61a0a8", "#d48265", "#749f83",
                "#ca8622", "#bda29a", "#6e7074", "#546570", "#c4ccd3",
                "#f05b72", "#444693", "#726930", "#b2d235", "#6d8346",
                "#ac6767", "#1d953f", "#6950a1",
            ]
        )
        if color not in colors:
            colors.append(color)
        if len(colors) == len(data):
            break
    conn = pymysql.connect(host='120.79.165.140', user='kino', passwd="student",
                           db='python_web_spider_DB')
    cur = conn.cursor()
    sql1 = "SELECT 上映年份 from wh_doubanmovie"
    cur.execute(sql1)
    # Bucket every release year into one of the ranges above.
    for i in cur:
        year = int(str(i).split('\'')[1])
        if year <= 1990:
            data[0][1] += 1
        elif 1990 < year <= 1995:
            data[1][1] += 1
        elif 1995 < year <= 2000:
            data[2][1] += 1
        elif 2000 < year <= 2005:
            data[3][1] += 1
        elif 2005 < year <= 2010:
            data[4][1] += 1
        elif 2010 < year <= 2015:
            data[5][1] += 1
        elif 2015 < year:
            data[6][1] += 1
    c = (
        Pie()
        .add("", data)
        .set_colors(colors)
        # .add("", [list(z) for z in zip(Faker.choose(), Faker.values())])
        # .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
        .set_global_opts(title_opts=opts.TitleOpts(title=""))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    )
    cur.close()
    conn.close()
    return HttpResponse(c.render_embed('豆瓣电影排行top250年份分布.html'))


def word_cloud(request):
    x = []
    data = []
    # Stop words excluded from the word cloud.
    delete = ['的', '你', '我', '他', '说', '了', '话', '这', '是', '吗', '吧', '都',
              '也', '不', '为', '他们', '啊', '则', '和', '在']
    conn = pymysql.connect(host='120.79.165.140', user='kino', passwd="student",
                           db='python_web_spider_DB')
    cur = conn.cursor()
    sql1 = "SELECT comments from xjh_wangyiyun"
    cur.execute(sql1)
    txt = ''
    for i in cur:
        ss = str(i)
        if '\'' in ss:
            txt += ss.split('\'')[1]
    # Segment the concatenated comments into words.
    ls = jieba.lcut(txt)
    for item in ls:
        # Keep only tokens made up entirely of Chinese characters.
        if re.match(r'^([\u4e00-\u9fa5]+)$', item):
            x.append(item)
    # Count each word once, then drop all of its occurrences from the list;
    # later duplicates therefore get a count of 0 and are filtered out below.
    for item in x[::]:
        t = (item, x.count(item) * 3)
        data.append(t)
        while item in x:
            x.remove(item)
    # Drop the zero-count duplicates and the stop words, then sort by frequency.
    data = [item for item in data if item[1] != 0 and item[0] not in delete]
    data.sort(key=lambda pair: pair[1], reverse=True)
    c = (
        WordCloud()
        .add(series_name="歌曲:Mood 的评论高频词", data_pair=data, word_size_range=[10, 70])
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="网易云歌曲评论词云",
                title_textstyle_opts=opts.TextStyleOpts(font_size=23),
            ),
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
    )
    cur.close()
    conn.close()
    return HttpResponse(c.render_embed('网易云歌曲评论词云.html'))


# Write your spider functions from here on, for example:
# def spider_fun(url, web_name):
#     pass
from bs4 import BeautifulSoup
import urllib.request, urllib.error  # urllib is used here in place of the requests library
import os
import pandas as pd

# Regular expressions for extracting fields from the listing HTML.
# NOTE: the surrounding markup in some of these patterns is missing from this copy of the file.
# findLink = re.compile(r'href="(.*?)"')               # extract the link
findTitle = re.compile(r'target="_blank">(.*?)</a>')   # extract the title
findPrice = re.compile(r'(.*?)')                       # extract the price
findTag = re.compile(r'/" target="_blank">(.*?)</a>')  # extract the item category
findPlace = re.compile(r'(.*?)')                       # extract the address


def askURL(url):
    # Request a page and return its HTML.
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        # print(html)  # test
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
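# The import comment above notes that urllib stands in for the requests library here.
# A rough equivalent of askURL written with requests (an assumption -- requests is not
# a dependency of this file) could look like:
#
#   import requests
#
#   def askURL(url):
#       head = {'user-agent': 'Mozilla/5.0 ...'}
#       try:
#           return requests.get(url, headers=head, timeout=10).text
#       except requests.RequestException as e:
#           print(e)
#           return ''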
def getData(baseurl):
    # Walk the listing pages and extract the fields we need.
    datalist = []
    for i in range(1, 4):  # fetch each page by number; only a few pages are visited here
        url = baseurl + str(i)
        html = askURL(url)
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="media-body"):
            # print(item)  # test
            data = []
            item = str(item)
            title = re.findall(findTitle, item)[0]
            place = re.findall(findPlace, item)[0]
            price = re.findall(findPrice, item)[0]
            tag = re.findall(findTag, item)[0]
            data.append(title)
            data.append(place)
            data.append(price)
            data.append(tag)
            datalist.append(data)
    return datalist


def saveData(savepath, datalist, web_name):
    # Save the scraped rows to a CSV file.
    name = ["标题", "地址", "价格", "类型"]
    file = pd.DataFrame(columns=name, data=datalist)  # combine the header row and the data
    file.to_csv(savepath + '/lyh_tiaozaomarket.csv')  # save to the current path as xxx.csv
    print('已保存%s信息' % web_name)


'''--------- code starts here ---------'''


def begin_spider(url, web_name):
    savepath = os.getcwd()  # use the current working directory as the save path
    datalist = getData(url)
    saveData(savepath, datalist, web_name)


def main():
    url = 'https://guilin.baixing.com/ershou/?page='
    web_name = '桂林百姓网二手市场'
    begin_spider(url, web_name)


if __name__ == "__main__":
    main()
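# For reference, a minimal sketch of how the Django views above could be wired up in
# urls.py (an assumption, not part of this file; the route patterns are illustrative only):
#
#   from django.urls import path
#   from . import views
#
#   urlpatterns = [
#       path('', views.home),
#       path('page<int:page>/', views.page_views),
#       path('table_one/', views.table_one),
#       path('pie_views/', views.pie_views),
#       path('word_cloud/', views.word_cloud),
#   ]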