commit 31d6d15c69 (parent d737bee569) by qweasdzxc227, 6 months ago, on branch main

@@ -50,47 +50,22 @@ class JobboleSpider(scrapy.Spider):
            cookie_dict[cookie['name']] = cookie['value']
        for url in self.start_urls:
            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
        # cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
        # print(cookies)
        # print(cookie_dict)
        # yield scrapy.Request(url='https://account.cnblogs.com/signin', callback=self.parse, cookies=cookie_dict)

    def parse(self, response):
        # 1. Extract the article URLs from the news list page and hand them to
        #    Scrapy for download; each response is parsed by the matching callback.
        # extract_first() returns the first matched value.
-        post_nodes = response.css('#news_list .news_block')[:1]
+        post_nodes = response.css('#news_list .news_block')[:100]
        for post_node in post_nodes:
            image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
            post_url = post_node.css('h2 a::attr(href)').extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url}, callback=self.parse_detail, dont_filter=True)
        # 2. Extract the next-page URL and hand it to Scrapy; once the download
        #    finishes, the response comes back to parse() to keep following pages.
-        # next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
-        # yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
+        next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
+        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
    def parse_detail(self, response):
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)
            # article_item = JobBoleArticleItem()
            # title = response.css('#news_title a::text').extract_first("")
            # create_date = response.css('#news_info .time::text').extract_first("")
            # match_re = re.match(r'.*?(\d+.*)', create_date)
            # if match_re:
            #     create_date = match_re.group(1)
            # # create_date = response.xpath('//*[@id="news_info"]//*[@class="time"]/text()').extract_first("")
            #
            # content = response.css('#news_content').extract()[0]
            # tag_list = response.css('.news_tags a::text').extract()
            # tags = ','.join(tag_list)
            # article_item['title'] = title
            # article_item['create_date'] = create_date
            # article_item['content'] = content
            # article_item['tags'] = tags
            # article_item['url'] = response.url
            # if response.meta.get('front_image_url', ""):
            #     article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
            # else:
            #     article_item['front_image_url'] = []
            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
            item_loader.add_css('title', '#news_title a::text')
            item_loader.add_css('content', '#news_content')
@@ -98,31 +73,15 @@ class JobboleSpider(scrapy.Spider):
            item_loader.add_css('create_date', '#news_info .time::text')
            item_loader.add_value('url', response.url)
            item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
            # article_item = item_loader.load_item()
            # if response.meta.get('front_image_url', ""):
            #     article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
            # else:
            #     article_item['front_image_url'] = []
            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={'article_item': item_loader, 'url': response.url}, callback=self.parse_nums)
            # praise_nums = j_data['DiggCount']
            # fav_nums = j_data['TotalView']
            # comment_nums = j_data['CommentCount']
            # pass

    def parse_nums(self, response):
        j_data = json.loads(response.text)
        item_loader = response.meta.get('article_item', "")
        # praise_nums = j_data['DiggCount']
        # fav_nums = j_data['TotalView']
        # comment_nums = j_data['CommentCount']
        item_loader.add_value('praise_nums', j_data['DiggCount'])
        item_loader.add_value('fav_nums', j_data['TotalView'])
        item_loader.add_value('comment_nums', j_data['CommentCount'])
        item_loader.add_value('url_object_id', common.get_md5(response.meta.get('url', '')))
        # article_item['praise_nums'] = praise_nums
        # article_item['fav_nums'] = fav_nums
        # article_item['comment_nums'] = comment_nums
        # article_item['url_object_id'] = common.get_md5(article_item['url'])
        article_item = item_loader.load_item()
        yield article_item
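The hunk above leans on an ArticleItemLoader and JobBoleArticleItem defined in items.py, which this commit does not touch. A minimal sketch of what that loader setup presumably looks like, assuming a TakeFirst output processor so each add_css/add_value collapses to a single value (everything beyond the two class names used above is illustrative):

# Hypothetical items.py sketch; only the class names appear in the diff above.
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity, TakeFirst


class ArticleItemLoader(ItemLoader):
    # Collapse every extracted list to its first value by default.
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    # Image pipelines expect a list of URLs, so keep this field as a list.
    front_image_url = scrapy.Field(output_processor=Identity())
    content = scrapy.Field()
    praise_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()

Passing the loader itself through meta, as parse_detail does, defers load_item() until parse_nums has added the counters fetched from the AJAX endpoint.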

@@ -17,8 +17,11 @@ Including another URLconf
from django.contrib import admin
from django.urls import path
from django.views.generic import TemplateView
from search.views import SearchSuggest, SearchView

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', TemplateView.as_view(template_name='index.html'), name='index'),
    path('suggest/', SearchSuggest.as_view(), name='suggest'),
    path('search/', SearchView.as_view(), name='search'),
]
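Because both new routes are named, templates and Python code can resolve them by name instead of hard-coding paths; the template hunks at the bottom of this commit rely on exactly that. A quick check from `python manage.py shell` (illustrative only):

# Resolving the named routes added above.
from django.urls import reverse

print(reverse('suggest'))  # -> /suggest/
print(reverse('search'))   # -> /search/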

@@ -1,3 +1,47 @@
from django.db import models
# Create your models here.
# -*- coding: utf-8 -*-
__author__ = 'bobby'
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])


class CustomAnalyzer(_CustomAnalyzer):
    # Completion fields reject a plain analyzer-name string, but the ik
    # analyzers are supplied by the IK plugin, so no analysis definition is
    # needed; returning {} keeps elasticsearch_dsl from emitting one.
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class ArticleType(DocType):
    # Jobbole article document type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "jobbole"
        doc_type = "article"


if __name__ == "__main__":
    ArticleType.init()
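For context, a minimal sketch of how a crawler pipeline might write one document into this mapping with the legacy DocType API used above; the field values are made up, and only ArticleType itself comes from this commit:

# Hypothetical indexing sketch (not part of this commit).
from search.models import ArticleType

ArticleType.init()  # create the "jobbole" index and mapping if missing

article = ArticleType()
article.meta.id = '0a1b2c3d...'  # e.g. an md5 of the URL, like url_object_id
article.title = 'Example article'
article.content = 'Body text ...'
article.suggest = [{'input': ['Example article'], 'weight': 10}]
article.save()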

@@ -1,3 +1,88 @@
from django.shortcuts import render
from django.views.generic.base import View
from search.models import ArticleType
from django.http import HttpResponse
import json
from elasticsearch import Elasticsearch
from datetime import datetime

client = Elasticsearch(hosts=['127.0.0.1'])


# Create your views here.
class SearchSuggest(View):
    # Search-suggestion (autocomplete) endpoint
    def get(self, request):
        key_words = request.GET.get('s', '')
        re_datas = []
        if key_words:
            s = ArticleType.search()
            s = s.suggest('my_suggest', key_words, completion={
                "field": "suggest", "fuzzy": {
                    "fuzziness": 2
                },
                "size": 10
            })
            suggestions = s.execute_suggest()
            for match in suggestions.my_suggest[0].options:
                source = match._source
                re_datas.append(source["title"])
        return HttpResponse(json.dumps(re_datas), content_type="application/json")
class SearchView(View):
    def get(self, request):
        key_words = request.GET.get("q", '')
        page = request.GET.get('p', '1')
        try:
            page = int(page)
        except ValueError:
            page = 1
        start_time = datetime.now()
        response = client.search(
            index="jobbole",
            body={
                "query": {
                    "multi_match": {
                        "query": key_words,
                        "fields": ["tags", "title", "content"]
                    }
                },
                "from": (page - 1) * 10,
                "size": 10,
                "highlight": {
                    "pre_tags": ['<span class="keyWord">'],
                    "post_tags": ['</span>'],
                    "fields": {
                        "title": {},
                        "content": {},
                    }
                }
            }
        )
        end_time = datetime.now()
        last_seconds = (end_time - start_time).total_seconds()
        total_nums = response['hits']['total']
        if (total_nums % 10) > 0:
            page_nums = int(total_nums / 10) + 1
        else:
            page_nums = int(total_nums / 10)
        # Build the result list, pulling each field's value out of the hit
        hit_list = []
        for hit in response['hits']['hits']:
            hit_dict = {}
            highlight = hit.get('highlight', {})
            if 'title' in highlight:
                hit_dict['title'] = "".join(highlight['title'])
            else:
                hit_dict['title'] = hit['_source']['title']
            if 'content' in highlight:
                hit_dict['content'] = "".join(highlight['content'])[:500]
            else:
                hit_dict['content'] = hit['_source']['content'][:500]
            hit_dict["create_date"] = hit['_source']['create_date']
            hit_dict["url"] = hit['_source']['url']
            hit_dict["score"] = hit['_score']
            hit_list.append(hit_dict)
        return render(request, 'result.html',
                      {'page': page, 'total_nums': total_nums, 'all_hits': hit_list,
                       'key_words': key_words, 'page_nums': page_nums,
                       'last_seconds': last_seconds})
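With the routes from urls.py wired to these views, both endpoints can be smoke-tested against a local dev server; the host and port below are assumptions, not part of this commit:

# Illustrative smoke test, assuming `python manage.py runserver` on port 8000.
import requests

suggestions = requests.get('http://127.0.0.1:8000/suggest/', params={'s': 'python'}).json()
print(suggestions)  # a JSON list of matching titles

result_page = requests.get('http://127.0.0.1:8000/search/', params={'q': 'python', 'p': '1'})
print(result_page.status_code)  # 200 when result.html renders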

@@ -66,8 +66,8 @@
<script type="text/javascript" src="{% static 'js/jquery.js' %}"></script>
<script type="text/javascript" src="{% static 'js/global.js' %}"></script>
<script type="text/javascript">
-    var suggest_url = "/suggest/"
-    var search_url = "/search/"
+    var suggest_url = "{% url "suggest" %}"
+    var search_url = "{% url "search" %}"
    $('.searchList').on('click', '.searchItem', function(){

@@ -124,7 +124,7 @@
<script type="text/javascript" src="{% static 'js/global.js' %}"></script>
<script type="text/javascript" src="{% static 'js/pagination.js' %}"></script>
<script type="text/javascript">
-    var search_url = "{% url 'search' %}"
+    var search_url = "{% url "search" %}"
    $('.searchList').on('click', '.searchItem', function(){
        $('.searchList .searchItem').removeClass('current');
