main
qweasdzxc227 6 months ago
parent d737bee569
commit 31d6d15c69

@ -50,47 +50,22 @@ class JobboleSpider(scrapy.Spider):
cookie_dict[cookie['name']] = cookie['value'] cookie_dict[cookie['name']] = cookie['value']
for url in self.start_urls: for url in self.start_urls:
yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict) yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
# cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
# print(cookies)
# print(cookie_dict)
# yield scrapy.Request(url='https://account.cnblogs.com/signin', callback=self.parse, cookies=cookie_dict)
def parse(self, response):
    """Parse a news-list page: schedule each article's detail page, then paginate.

    Yields one detail-page Request per article (cover-image URL forwarded via
    ``meta['front_image_url']``) and, when present, one Request for the next
    list page back into this method.
    """
    # 1. Extract article entries from the list page and hand each to parse_detail.
    post_nodes = response.css('#news_list .news_block')[:100]
    for post_node in post_nodes:
        # BUGFIX: only prefix the scheme when an image src actually exists;
        # the original produced the bogus URL "https:" for image-less posts.
        image_src = post_node.css('.entry_summary a img::attr(src)').extract_first("")
        image_url = "https:" + image_src if image_src else ""
        post_url = post_node.css('h2 a::attr(href)').extract_first("")
        yield Request(url=parse.urljoin(response.url, post_url),
                      meta={'front_image_url': image_url},
                      callback=self.parse_detail, dont_filter=True)
    # 2. Follow the "next page" link so the crawl continues.
    next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
    if next_url:  # guard: an empty href would urljoin back to the current page
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse_detail(self, response):
    """Parse an article detail page and chain a request for its counters.

    The numeric article id is pulled from the URL. Field extraction goes
    through an ItemLoader, which is forwarded (via meta) to parse_nums where
    the AJAX-supplied digg/view/comment counts are merged in before the item
    is finally yielded.
    """
    # BUGFIX: raw string for the regex — "\d" in a plain string is an invalid
    # escape sequence (SyntaxWarning on modern Python).
    match_re = re.match(r".*?(\d+)", response.url)  # first digit run = article id
    if match_re:
        post_id = match_re.group(1)
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css('title', '#news_title a::text')
        item_loader.add_css('content', '#news_content')
        # NOTE(review): the diff omits unchanged lines here (likely a 'tags'
        # add_css) — verify against the full file before relying on this list.
        item_loader.add_css('create_date', '#news_info .time::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
        # Counters live behind an AJAX endpoint; fetch them and finish the
        # item in parse_nums.
        yield Request(url=parse.urljoin(response.url,
                                        "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                      meta={'article_item': item_loader, 'url': response.url},
                      callback=self.parse_nums)
def parse_nums(self, response):
    """Merge the AJAX counter payload into the forwarded ItemLoader and yield the item."""
    counters = json.loads(response.text)
    loader = response.meta.get('article_item', "")
    # Counter fields come straight from the JSON payload of GetAjaxNewsInfo.
    loader.add_value('praise_nums', counters['DiggCount'])
    loader.add_value('fav_nums', counters['TotalView'])
    loader.add_value('comment_nums', counters['CommentCount'])
    # Stable primary key: md5 of the article URL forwarded from parse_detail.
    loader.add_value('url_object_id', common.get_md5(response.meta.get('url', '')))
    yield loader.load_item()

@ -17,8 +17,11 @@ Including another URLconf
from django.contrib import admin
from django.urls import path
from django.views.generic import TemplateView

from search.views import SearchSuggest, SearchView

# Routes: admin site, static index page, and the two search endpoints.
urlpatterns = [
    path('admin/', admin.site.urls),
    path('', TemplateView.as_view(template_name='index.html'), name='index'),
    path('suggest/', SearchSuggest.as_view(), name='suggest'),
    path('search/', SearchView.as_view(), name='search'),
]

@ -1,3 +1,47 @@
from django.db import models

# Create your models here.
# -*- coding: utf-8 -*-
__author__ = 'bobby'
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

# Register the default Elasticsearch connection used by the DocTypes below.
connections.create_connection(hosts=["localhost"])


class CustomAnalyzer(_CustomAnalyzer):
    """Analyzer wrapper that emits no analysis definition of its own.

    Returning an empty definition makes the client reference the analyzer by
    name only — presumably "ik_max_word" is provided server-side by the
    elasticsearch-analysis-ik plugin; verify against the cluster setup.
    """

    def get_analysis_definition(self):
        # Empty dict: rely on the analyzer already present on the ES cluster.
        return {}


# ik_max_word tokenizer with lowercase filtering, shared by the text fields.
ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
class ArticleType(DocType):
    """Elasticsearch document mapping for a crawled article."""

    # Completion field powering the /suggest/ endpoint.
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        # Target index name and mapping type on the ES cluster.
        index = "jobbole"
        doc_type = "article"


if __name__ == "__main__":
    # Running this module directly creates the index and pushes the mapping.
    ArticleType.init()

@ -1,3 +1,88 @@
from django.shortcuts import render from django.shortcuts import render
from django.views.generic.base import View
from search.models import ArticleType
from django.http import HttpResponse
import json
from elasticsearch import Elasticsearch
from datetime import datetime
client = Elasticsearch(hosts=['127.0.0.1'])
# Create your views here. # Create your views here.
class SearchSuggest(View):
    """Search-suggestion endpoint: return up to 10 completion titles as JSON."""

    def get(self, request):
        keywords = request.GET.get('s', '')
        suggestions = []
        if keywords:
            search = ArticleType.search()
            # Fuzzy completion suggester against the 'suggest' field.
            search = search.suggest('my_suggest', keywords, completion={
                "field": "suggest",
                "fuzzy": {"fuzziness": 2},
                "size": 10,
            })
            result = search.execute_suggest()
            for option in result.my_suggest[0].options:
                suggestions.append(option._source["title"])
        return HttpResponse(json.dumps(suggestions), content_type="application/json")
class SearchView(View):
    """Full-text search endpoint: query Elasticsearch and render a result page."""

    def get(self, request):
        key_words = request.GET.get("q", '')
        # Requested page number; fall back to page 1 on anything non-numeric.
        try:
            page = int(request.GET.get('p', '1'))
        except (ValueError, TypeError):  # BUGFIX: was a bare `except:`
            page = 1
        start_time = datetime.now()
        response = client.search(
            index="jobbole",
            body={
                "query": {
                    "multi_match": {
                        "query": key_words,
                        "fields": ["tags", "title", "content"]
                    }
                },
                "from": (page - 1) * 10,
                "size": 10,
                "highlight": {
                    "pre_tags": ['<span class="keyWord">'],
                    "post_tags": ['</span>'],
                    "fields": {
                        "title": {},
                        "content": {},
                    }
                }
            }
        )
        last_seconds = (datetime.now() - start_time).total_seconds()
        # assumes a pre-7.x ES response where hits.total is an int — verify.
        total_nums = response['hits']['total']
        # BUGFIX: the page count derives from the TOTAL hit count; the original
        # tested `page % 10`, giving wrong counts on most result sets.
        page_nums = -(-total_nums // 10)  # ceil(total_nums / 10), 10 hits/page
        hit_list = []
        for hit in response['hits']['hits']:
            # BUGFIX: 'highlight' is absent when ES produced no highlight for
            # a hit; the original raised KeyError in that case.
            highlight = hit.get('highlight', {})
            source = hit['_source']
            hit_dict = {
                'title': "".join(highlight['title']) if 'title' in highlight
                         else source['title'],
                'content': ("".join(highlight['content']) if 'content' in highlight
                            else source['content'])[:500],
                'create_date': source['create_date'],
                'url': source['url'],
                'score': hit['_score'],
            }
            hit_list.append(hit_dict)
        # (duplicate 'total_nums' key from the original context dict removed)
        return render(request, 'result.html', {
            'page': page,
            'all_hits': hit_list,
            'key_words': key_words,
            'page_nums': page_nums,
            'total_nums': total_nums,
            'last_seconds': last_seconds,
        })

@ -66,8 +66,8 @@
<script type="text/javascript" src="{% static 'js/jquery.js' %}"></script> <script type="text/javascript" src="{% static 'js/jquery.js' %}"></script>
<script type="text/javascript" src="{% static 'js/global.js' %}"></script> <script type="text/javascript" src="{% static 'js/global.js' %}"></script>
<script type="text/javascript"> <script type="text/javascript">
var suggest_url = "/suggest/" var suggest_url = "{% url "suggest" %}"
var search_url = "/search/" var search_url = "{% url "search" %}"
$('.searchList').on('click', '.searchItem', function(){ $('.searchList').on('click', '.searchItem', function(){

@ -124,7 +124,7 @@
<script type="text/javascript" src="{% static 'js/global.js' %}"></script> <script type="text/javascript" src="{% static 'js/global.js' %}"></script>
<script type="text/javascript" src="{% static 'js/pagination.js' %}"></script> <script type="text/javascript" src="{% static 'js/pagination.js' %}"></script>
<script type="text/javascript"> <script type="text/javascript">
var search_url = "{% url 'search' %}" var search_url = "{% url "search" %}"
$('.searchList').on('click', '.searchItem', function(){ $('.searchList').on('click', '.searchItem', function(){
$('.searchList .searchItem').removeClass('current'); $('.searchList .searchItem').removeClass('current');

Loading…
Cancel
Save