From 31d6d15c6952f4395ea9f2f4bcfcb39fe78b00b9 Mon Sep 17 00:00:00 2001
From: qweasdzxc227 <1095578930@qq.com>
Date: Wed, 29 May 2024 17:31:28 +0800
Subject: [PATCH] 5.29.17.31

---
 .../__pycache__/settings.cpython-39.pyc        | Bin 1213 -> 1211 bytes
 ArticleSpider/ArticleSpider/settings.py        | 228 +++++++++---------
 .../__pycache__/jobbole.cpython-39.pyc         | Bin 3539 -> 3643 bytes
 ArticleSpider/ArticleSpider/spiders/jobbole.py | 215 +++++++----------
 .../LcvSearch/__pycache__/urls.cpython-39.pyc  | Bin 1067 -> 1210 bytes
 LcvSearch/LcvSearch/urls.py                    |   3 +
 .../search/__pycache__/models.cpython-39.pyc   | Bin 175 -> 1672 bytes
 .../search/__pycache__/views.cpython-39.pyc    | Bin 0 -> 2437 bytes
 LcvSearch/search/models.py                     |  44 ++++
 LcvSearch/search/views.py                      |  85 +++++++
 LcvSearch/templates/index.html                 |   4 +-
 LcvSearch/templates/result.html                |   2 +-
 12 files changed, 336 insertions(+), 245 deletions(-)
 create mode 100644 LcvSearch/search/__pycache__/views.cpython-39.pyc

diff --git a/ArticleSpider/ArticleSpider/__pycache__/settings.cpython-39.pyc b/ArticleSpider/ArticleSpider/__pycache__/settings.cpython-39.pyc
index 22bc2d03528414b60d83b048b86fe85477c1b37d..b08b48758091daa40111f2c0d22fa9b3c7b3f8b5 100644
GIT binary patch

diff --git a/ArticleSpider/ArticleSpider/spiders/jobbole.py b/ArticleSpider/ArticleSpider/spiders/jobbole.py
index 3bb86bc..fa0fcc5 100644
--- a/ArticleSpider/ArticleSpider/spiders/jobbole.py
+++ b/ArticleSpider/ArticleSpider/spiders/jobbole.py
@@ -1,128 +1,87 @@
-import json
-import re
-import os
-import requests
-import scrapy
-import pickle
-import datetime
-from scrapy.http import Request
-from urllib import parse
-from scrapy.loader import ItemLoader
-from ArticleSpider.items import ArticleItemLoader
-from ArticleSpider.items import JobBoleArticleItem
-from ArticleSpider.utils import common
-from ArticleSpider.utils.common import get_md5
-from scrapy import signals
-import time
-from selenium import webdriver
-from scrapy.loader import ItemLoader
-
-
-class JobboleSpider(scrapy.Spider):
-    name = "jobbole"
-    allowed_domains = ["news.cnblogs.com"]
-    start_urls = ["http://news.cnblogs.com/"]
-
-    def start_requests(self):
-        cookies = []
-        if os.path.exists(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie'):
-            cookies = pickle.load(open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'rb'))
-        if not cookies:
-            driver = webdriver.Chrome()
-            driver.implicitly_wait(10)
-            # Open the sign-in page
-            driver.get('https://account.cnblogs.com/signin')
-            # Mask navigator.webdriver so the CAPTCHA click is not rejected as automation
-            driver.execute_script("Object.defineProperties(navigator,{webdriver:{get:()=>undefined}})")
-            # Enter the username
-            driver.find_element_by_id('mat-input-0').send_keys('包包1')
-            # Enter the password
-            driver.find_element_by_id('mat-input-1').send_keys('qweasdzxc227')
-            # Click the sign-in button
-            driver.find_element_by_css_selector('.mat-button-wrapper').click()
-            # Click the CAPTCHA
-            driver.find_element_by_xpath('//*[@id="Shape3"]').click()
-            time.sleep(5)
-            cookies = driver.get_cookies()
-            pickle.dump(cookies, open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'wb'))
-        cookie_dict = {}
-        for cookie in cookies:
-            cookie_dict[cookie['name']] = cookie['value']
-        for url in self.start_urls:
-            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
-        # cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
-        # print(cookies)
-        # print(cookie_dict)
-        # yield scrapy.Request(url='https://account.cnblogs.com/signin', callback=self.parse, cookies=cookie_dict)
-
-    def parse(self, response):
-        # 1. Pull the article URLs off the news list page, hand them to Scrapy to
-        #    download, then invoke the matching parse callback
-        # Extract article links; extract_first() returns the first match
-        post_nodes = response.css('#news_list .news_block')[:1]
-        for post_node in post_nodes:
-            image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
-            post_url = post_node.css('h2 a::attr(href)').extract_first("")
-            yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url}, callback=self.parse_detail, dont_filter=True)
-        # 2. Grab the next-page URL and hand it to Scrapy; once downloaded, parse follows up
-        # next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
-        # yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
-
-    def parse_detail(self, response):
-        match_re = re.match(".*?(\d+)", response.url)
-        if match_re:
-            post_id = match_re.group(1)
-            # article_item = JobBoleArticleItem()
-            # title = response.css('#news_title a::text').extract_first("")
-            # create_date = response.css('#news_info .time::text').extract_first("")
-            # match_re = re.match('.*?(\d+.*)', create_date)
-            # if match_re:
-            #     create_date = match_re.group(1)
-            # # create_date = response.xpath('//*[@id="news_info"]//*[@class="time"]/text()').extract_first("")
-            #
-            # content = response.css('#news_content').extract()[0]
-            # tag_list = response.css('.news_tags a::text').extract()
-            # tags = ','.join(tag_list)
-            # article_item['title'] = title
-            # article_item['create_date'] = create_date
-            # article_item['content'] = content
-            # article_item['tags'] = tags
-            # article_item['url'] = response.url
-            # if response.meta.get('front_image_url', ""):
-            #     article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
-            # else:
-            #     article_item['front_image_url'] = []
-            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
-            item_loader.add_css('title', '#news_title a::text')
-            item_loader.add_css('content', '#news_content')
-            item_loader.add_css('tags', '.news_tags a::text')
-            item_loader.add_css('create_date', '#news_info .time::text')
-            item_loader.add_value('url', response.url)
-            item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
-            # article_item = item_loader.load_item()
-            # if response.meta.get('front_image_url', ""):
-            #     article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
-            # else:
-            #     article_item['front_image_url'] = []
-            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
-                          meta={'article_item': item_loader, 'url': response.url}, callback=self.parse_nums)
-        # praise_nums = j_data['DiggCount']
-        # fav_nums = j_data['TotalView']
-        # comment_nums = j_data['CommentCount']
-        # pass
-
-    def parse_nums(self, response):
-        j_data = json.loads(response.text)
-        item_loader = response.meta.get('article_item', "")
-        # praise_nums = j_data['DiggCount']
-        # fav_nums = j_data['TotalView']
-        # comment_nums = j_data['CommentCount']
-        item_loader.add_value('praise_nums', j_data['DiggCount'])
-        item_loader.add_value('fav_nums', j_data['TotalView'])
-        item_loader.add_value('comment_nums', j_data['CommentCount'])
-        item_loader.add_value('url_object_id', common.get_md5(response.meta.get('url', '')))
-        # article_item['praise_nums'] = praise_nums
-        # article_item['fav_nums'] = fav_nums
-        # article_item['comment_nums'] = comment_nums
-        # article_item['url_object_id'] = common.get_md5(article_item['url'])
-        article_item = item_loader.load_item()
-        yield article_item
+import json
+import re
+import os
+import pickle
+import time
+
+import scrapy
+from scrapy.http import Request
+from urllib import parse
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+
+from ArticleSpider.items import ArticleItemLoader
+from ArticleSpider.items import JobBoleArticleItem
+from ArticleSpider.utils import common
+
+
+class JobboleSpider(scrapy.Spider):
+    name = "jobbole"
+    allowed_domains = ["news.cnblogs.com"]
+    start_urls = ["http://news.cnblogs.com/"]
+
+    def start_requests(self):
+        # Reuse a pickled cookie jar when one exists; otherwise log in once via Selenium.
+        cookies = []
+        if os.path.exists(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie'):
+            with open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'rb') as f:
+                cookies = pickle.load(f)
+        if not cookies:
+            driver = webdriver.Chrome()
+            driver.implicitly_wait(10)
+            # Open the sign-in page
+            driver.get('https://account.cnblogs.com/signin')
+            # Mask navigator.webdriver so the CAPTCHA click is not rejected as automation
+            driver.execute_script("Object.defineProperties(navigator,{webdriver:{get:()=>undefined}})")
+            # Enter the username
+            driver.find_element(By.ID, 'mat-input-0').send_keys('包包1')
+            # Enter the password
+            driver.find_element(By.ID, 'mat-input-1').send_keys('qweasdzxc227')
+            # Click the sign-in button
+            driver.find_element(By.CSS_SELECTOR, '.mat-button-wrapper').click()
+            # Click the CAPTCHA
+            driver.find_element(By.XPATH, '//*[@id="Shape3"]').click()
+            time.sleep(5)
+            cookies = driver.get_cookies()
+            with open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'wb') as f:
+                pickle.dump(cookies, f)
+        cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
+        for url in self.start_urls:
+            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
+
+    def parse(self, response):
+        # 1. Pull the article URLs off the news list page, hand them to Scrapy to
+        #    download, then invoke parse_detail on each response.
+        post_nodes = response.css('#news_list .news_block')[:100]
+        for post_node in post_nodes:
+            image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
+            post_url = post_node.css('h2 a::attr(href)').extract_first("")
+            yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url}, callback=self.parse_detail, dont_filter=True)
+        # 2. Follow the next-page link and let parse keep crawling.
+        next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
+        if next_url:
+            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
+
+    def parse_detail(self, response):
+        match_re = re.match(r".*?(\d+)", response.url)
+        if match_re:
+            post_id = match_re.group(1)
+            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
+            item_loader.add_css('title', '#news_title a::text')
+            item_loader.add_css('content', '#news_content')
+            item_loader.add_css('tags', '.news_tags a::text')
+            item_loader.add_css('create_date', '#news_info .time::text')
+            item_loader.add_value('url', response.url)
+            item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
+            # Fetch the digg/view/comment counters from the Ajax endpoint before yielding the item.
+            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
+                          meta={'article_item': item_loader, 'url': response.url}, callback=self.parse_nums)
+
+    def parse_nums(self, response):
+        j_data = json.loads(response.text)
+        item_loader = response.meta.get('article_item', "")
+        item_loader.add_value('praise_nums', j_data['DiggCount'])
+        item_loader.add_value('fav_nums', j_data['TotalView'])
+        item_loader.add_value('comment_nums', j_data['CommentCount'])
+        item_loader.add_value('url_object_id', common.get_md5(response.meta.get('url', '')))
+        article_item = item_loader.load_item()
+        yield article_item
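For context, parse_nums only reads three keys of the JSON returned by GetAjaxNewsInfo. A standalone sketch of that request, outside Scrapy, is below; the contentId is a placeholder, and any payload keys beyond DiggCount, TotalView and CommentCount are not guaranteed by this patch.

    import requests

    # Placeholder article id; real ids are parsed from the detail-page URL in parse_detail.
    resp = requests.get('https://news.cnblogs.com/NewsAjax/GetAjaxNewsInfo',
                        params={'contentId': 123456})
    j_data = resp.json()
    # The spider maps these three counters onto praise_nums, fav_nums and comment_nums.
    print(j_data['DiggCount'], j_data['TotalView'], j_data['CommentCount'])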
diff --git a/LcvSearch/LcvSearch/__pycache__/urls.cpython-39.pyc b/LcvSearch/LcvSearch/__pycache__/urls.cpython-39.pyc
index fd4412fe58e331767108e97724a0e6525ebfbe5c..8fa5b594037b28bd4a1025150a243e1c454439ed 100644
GIT binary patch

diff --git a/LcvSearch/LcvSearch/urls.py b/LcvSearch/LcvSearch/urls.py
index 30e2b8a..3a20af3 100644
--- a/LcvSearch/LcvSearch/urls.py
+++ b/LcvSearch/LcvSearch/urls.py
@@ -17,8 +17,11 @@ Including another URLconf
 from django.contrib import admin
 from django.urls import path
 from django.views.generic import TemplateView
+from search.views import SearchSuggest, SearchView
 
 urlpatterns = [
     path('admin/', admin.site.urls),
     path('', TemplateView.as_view(template_name='index.html'), name='index'),
+    path('suggest/', SearchSuggest.as_view(), name='suggest'),
+    path('search/', SearchView.as_view(), name='search'),
 ]

diff --git a/LcvSearch/search/__pycache__/models.cpython-39.pyc b/LcvSearch/search/__pycache__/models.cpython-39.pyc
index 4a1703aa203a44312302667ecc102ed8d191ad99..e631c0ba5660cddfa93560be9503c393e530c517 100644
GIT binary patch

diff --git a/LcvSearch/search/__pycache__/views.cpython-39.pyc b/LcvSearch/search/__pycache__/views.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bfade81eb0a0b60cb05ba3062414622e4e0e635
GIT binary patch
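A quick way to exercise the two routes added in urls.py above is Django's test client. This is a sketch, not part of the patch: it assumes the settings module is LcvSearch.settings and that an Elasticsearch instance with a populated jobbole index is reachable.

    import os
    import django

    # Assumed settings module for this project layout.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'LcvSearch.settings')
    django.setup()

    from django.test import Client

    c = Client()
    # SearchSuggest reads the "s" query parameter and returns a JSON list of titles.
    print(c.get('/suggest/', {'s': 'python'}).content)
    # SearchView reads "q" (query) and "p" (page) and renders result.html.
    print(c.get('/search/', {'q': 'python', 'p': '1'}).status_code)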
diff --git a/LcvSearch/search/models.py b/LcvSearch/search/models.py
index fd18c6e..df9c9c7 100644
--- a/LcvSearch/search/models.py
+++ b/LcvSearch/search/models.py
@@ -1,3 +1,47 @@
 from django.db import models
 
 # Create your models here.
+# -*- coding: utf-8 -*-
+__author__ = 'bobby'
+
+from datetime import datetime
+# NOTE: DocType (and the suggest API used in views.py) is the elasticsearch-dsl 5.x API.
+from elasticsearch_dsl import DocType, Date, Completion, Keyword, Text, Integer
+
+from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
+
+from elasticsearch_dsl.connections import connections
+
+connections.create_connection(hosts=["localhost"])
+
+
+class CustomAnalyzer(_CustomAnalyzer):
+    # Return an empty analysis definition so a built-in analyzer (ik_max_word)
+    # can be attached to a Completion field without redefining it in the mapping.
+    def get_analysis_definition(self):
+        return {}
+
+
+ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
+
+
+class ArticleType(DocType):
+    # Jobbole article document type
+    suggest = Completion(analyzer=ik_analyzer)
+    title = Text(analyzer="ik_max_word")
+    create_date = Date()
+    url = Keyword()
+    url_object_id = Keyword()
+    front_image_url = Keyword()
+    front_image_path = Keyword()
+    praise_nums = Integer()
+    comment_nums = Integer()
+    fav_nums = Integer()
+    tags = Text(analyzer="ik_max_word")
+    content = Text(analyzer="ik_max_word")
+
+    class Meta:
+        index = "jobbole"
+        doc_type = "article"
+
+
+if __name__ == "__main__":
+    ArticleType.init()
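The views below assume the jobbole mapping exists and already holds documents. Here is a sketch of creating the mapping and saving one article through ArticleType, using the same elasticsearch-dsl 5.x API as the model above; every field value is made up, and the suggest payload is normally built in the Scrapy pipeline, which is not part of this patch.

    from datetime import datetime
    from search.models import ArticleType

    # Create (or update) the "jobbole" mapping defined by ArticleType.
    ArticleType.init()

    # Index one hypothetical article.
    article = ArticleType()
    article.title = 'Sample article title'
    article.create_date = datetime.now()
    article.url = 'https://news.cnblogs.com/n/123456/'
    article.url_object_id = 'd41d8cd98f00b204e9800998ecf8427e'
    article.content = 'Body text ...'
    article.tags = 'python,scrapy'
    article.praise_nums = 0
    article.comment_nums = 0
    article.fav_nums = 0
    # Completion-field payload: input terms plus a ranking weight.
    article.suggest = [{'input': ['Sample', 'article'], 'weight': 10}]
    article.save()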
diff --git a/LcvSearch/search/views.py b/LcvSearch/search/views.py
index c60c790..8c31c11 100644
--- a/LcvSearch/search/views.py
+++ b/LcvSearch/search/views.py
@@ -1,3 +1,88 @@
 from django.shortcuts import render
+from django.views.generic.base import View
+from search.models import ArticleType
+from django.http import HttpResponse
+import json
+from elasticsearch import Elasticsearch
+from datetime import datetime
+
+client = Elasticsearch(hosts=['127.0.0.1'])
+
 # Create your views here.
+class SearchSuggest(View):
+    # Search-as-you-type suggestions
+    def get(self, request):
+        key_words = request.GET.get('s', '')
+        re_datas = []
+        if key_words:
+            s = ArticleType.search()
+            s = s.suggest('my_suggest', key_words, completion={
+                "field": "suggest", "fuzzy": {
+                    "fuzziness": 2
+                },
+                "size": 10
+            })
+            # execute_suggest() is the elasticsearch-dsl 5.x suggest API
+            suggestions = s.execute_suggest()
+            for match in suggestions.my_suggest[0].options:
+                source = match._source
+                re_datas.append(source["title"])
+        return HttpResponse(json.dumps(re_datas), content_type="application/json")
+
+
+class SearchView(View):
+    def get(self, request):
+        key_words = request.GET.get("q", '')
+        page = request.GET.get('p', '1')
+        try:
+            page = int(page)
+        except ValueError:
+            page = 1
+        start_time = datetime.now()
+        response = client.search(
+            index="jobbole",
+            body={
+                "query": {
+                    "multi_match": {
+                        "query": key_words,
+                        "fields": ["tags", "title", "content"]
+                    }
+                },
+                "from": (page - 1) * 10,
+                "size": 10,
+                "highlight": {
+                    # The original highlight markup was lost; a span wrapper is assumed here.
+                    "pre_tags": ['<span class="keyWord">'],
+                    "post_tags": ['</span>'],
+                    "fields": {
+                        "title": {},
+                        "content": {},
+                    }
+                }
+            }
+        )
+        end_time = datetime.now()
+        last_seconds = (end_time - start_time).total_seconds()
+        total_nums = response['hits']['total']
+        # Round the page count up when the last page is only partially filled.
+        if (total_nums % 10) > 0:
+            page_nums = int(total_nums / 10) + 1
+        else:
+            page_nums = int(total_nums / 10)
+        # Build one dict per hit, preferring highlighted fragments when present.
+        hit_list = []
+        for hit in response['hits']['hits']:
+            hit_dict = {}
+            highlight = hit.get('highlight', {})
+            if 'title' in highlight:
+                hit_dict['title'] = "".join(highlight['title'])
+            else:
+                hit_dict['title'] = hit['_source']['title']
+            if 'content' in highlight:
+                hit_dict['content'] = "".join(highlight['content'])[:500]
+            else:
+                hit_dict['content'] = hit['_source']['content'][:500]
+            hit_dict["create_date"] = hit['_source']['create_date']
+            hit_dict["url"] = hit['_source']['url']
+            hit_dict["score"] = hit['_score']
+            hit_list.append(hit_dict)
+        return render(request, 'result.html',
+                      {'page': page, 'total_nums': total_nums, 'all_hits': hit_list,
+                       'key_words': key_words, 'page_nums': page_nums,
+                       'last_seconds': last_seconds})

diff --git a/LcvSearch/templates/index.html b/LcvSearch/templates/index.html
index 42d3a1c..8b08c57 100644
--- a/LcvSearch/templates/index.html
+++ b/LcvSearch/templates/index.html
@@ -66,8 +66,8 @@
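For reference, the completion query that SearchSuggest builds through elasticsearch-dsl corresponds to roughly this raw request against Elasticsearch 5.x; the search text is a placeholder, and the presence of _source in the options assumes a populated suggest field on the jobbole index.

    from elasticsearch import Elasticsearch

    client = Elasticsearch(hosts=['127.0.0.1'])
    body = {
        "suggest": {
            "my_suggest": {
                "text": "python",
                "completion": {
                    "field": "suggest",
                    "fuzzy": {"fuzziness": 2},
                    "size": 10
                }
            }
        }
    }
    resp = client.search(index="jobbole", body=body)
    # Each option carries the matching document's _source.
    for option in resp["suggest"]["my_suggest"][0]["options"]:
        print(option["_source"]["title"])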