From d737bee5694c22b35dea6c35ef67a799fd2ad598 Mon Sep 17 00:00:00 2001 From: qweasdzxc227 <1095578930@qq.com> Date: Tue, 28 May 2024 22:01:02 +0800 Subject: [PATCH] 5.28.22.00 --- ArticleSpider/.idea/ArticleSpider.iml | 14 +- .../__pycache__/items.cpython-39.pyc | Bin 1399 -> 2756 bytes .../__pycache__/pipelines.cpython-39.pyc | Bin 6158 -> 6549 bytes .../__pycache__/settings.cpython-39.pyc | Bin 1211 -> 1213 bytes ArticleSpider/ArticleSpider/items.py | 150 ++++++++++-------- .../__pycache__/__init__.cpython-39.pyc | Bin 0 -> 152 bytes .../__pycache__/es_types.cpython-39.pyc | Bin 0 -> 1649 bytes .../__pycache__/jobbole.cpython-39.pyc | Bin 3539 -> 3539 bytes ArticleSpider/main.py | 22 ++- 9 files changed, 103 insertions(+), 83 deletions(-) create mode 100644 ArticleSpider/ArticleSpider/models/__pycache__/__init__.cpython-39.pyc create mode 100644 ArticleSpider/ArticleSpider/models/__pycache__/es_types.cpython-39.pyc diff --git a/ArticleSpider/.idea/ArticleSpider.iml b/ArticleSpider/.idea/ArticleSpider.iml index bbec2ce..0cb82f7 100644 --- a/ArticleSpider/.idea/ArticleSpider.iml +++ b/ArticleSpider/.idea/ArticleSpider.iml @@ -1,8 +1,8 @@ - - - - - - - + + + + + + + \ No newline at end of file diff --git a/ArticleSpider/ArticleSpider/__pycache__/items.cpython-39.pyc b/ArticleSpider/ArticleSpider/__pycache__/items.cpython-39.pyc index d053818a0511f8bb96b010ff659459d137282da2..f34a24f54087e2a87ab795e690d80c891f845ffb 100644 GIT binary patch literal 2756 zcmZ`*&2!u|6qjt->-BzQlR`;5l<6?cz!tisbePgo+Cn>J%116O7f(7KcctBU>yNFZ zO`44NlwNz_0v8U+kqdv2uAF+{#ElEQCvTdNfIWKpy(d|IdS6n~>4XHHpML&f^hb-3 z-;p`~FktRLQ@;V>gwu>fbx$J-ypuV(8@V})Snfq$?ngewaW`w^K@?cbvSuDep~YU- z%G*)fVn6HT-Ka~6=YwbFIG8yNczjKKxeYvXD0kUncgk)X{m&1K1!a7`>9mg1P{0< zbgFj@`=(5bJ~iP3E%NWmgbN8=>r1KAVJ05!Oa-!1KPdcHT1;o!_}d~K zZ|c5pTqU$|gfb1-XgWl&*sAf9=~NWlFkGnLlxN^)QJ5B^GS;(cCZcer1h=+RQE;e$ zVOl)b@*+$&9@CpNU+1)#}3n7kqf&YV_Mp0DV>!yyD7LivCOp6avp-EeT}_6HmM!@;G#T!pW8 z!zUqm@$y6uH;p%zUFo6}vQk>ccPqc_3eYb?v@57oZ z077sNpuhYW@c$?D3XC2mv~c%G#7?C3Q3+Y^dlHH&kyr#|2Z;;Bta33*W|@x5nFfEy zQ(1y*Re3z1Y@%aS$u%ITQk}EZ@~6~$U8i5b%p>*o*#U_>;ZB(FMFS)JC5-|BSqJjw zq7UtOYY(b9jJ6lj&Z6%w`lUtRv$L1E3%L%N4_~Y-M$Xh(5B0hMt_#e{H|3MB%1kUO z6WRqbXvxO-(VjunJ86f%=tm%KDuLOoOL;1eFp+H>;jv`P3B(bn=OC41M2)TTuOeGS-UEqeB|aeEq6Q!+S};EV6j;;*)U+rB6k5~*)Iv0A#~t0(WU@5r z0Y`azN~$vfCx{dni5QNi0r9BeI5JBk2`EnU zWGv#D%uMgNV4CPn)0)a8RU$5CQ0jtFvQ?utl%5=xs5uwEWq4Sb4#2p4GJ(@7PWdZ5 z1zus5@1K#_$!kbHLxPDQuOqpELL(AeHy~sg%((Z9KLOO5hS}8 zPRH2bXr1-`n5lR4ry}F^&hcolSzp`0ZtzQ#VOE_qgtdXS1_q%uL~XEd zk$rtW*T^RDCopgonmu&_(3(;IZ{MvETcDZ?kaHw{(LX0I;fmh@iB@Z^2bsN^3|rjb)MNkh%SgMQE&xP zB9X(*n5%4X?9wCgvS5}ML3}hI(S637sk9hNEcAw9?%T_Tjg;EfSl8@M|I275xLMzU aZsCp;7JjtQD)`;LPS;)WgXJLTv3~$xh@MFR delta 539 zcmZutyGjHx6rJoQGnvN@t}hg|Q7jeF?+K33tIEk{$&W_y43W8Qz*e>}6 zKfulp&_A$CVec2%dNT@I7|5A>a!<~jo7b_=AWmwvz`^zKa&X>cgLs;#!NT&!rX#3e z_l{tabo{&ONg!O|-8&~CcDV4d<4ca`5%NHU$ip9HO^_p}5e?qg^cHEmA(}&|pUFzy z!_v~Cg21cb=~H@G`5^jh2!zH%8%)AKox-kwS8LZCJA{ zyTUNlR$U1;JdD_Wn1Zw^ZB$e3L)X+0TKQQ=rmEEzKvAW&x3m+bd6vnZYW4bAVfcA+ zRaU^P4OPiSVLY5wp#=ULVjOidxQkhx`|~ka-iRT_4@~JtzqwDhmCV4taG9+fvy}bk bXZUKCi+)$G+W>{N;e?Y35>7%Q6yCMH_Qt>KIF1vNG)+n>Ic=SWKp`YWWX&b>`an{UE9p8 zNmD8;)JSkb3ZtSTE^y2ViHd7EbKwG~y>aRVZk!SC%{t0Y6I=7`y!U3mdGF1ew_jg= zIFYSpGD!t~zdv1Gf1`Id+pqq;d*M>cOPC2&Aqvr4-PGl?!|Z@(+)a8ZGbL>aH|=H2 zOnY6{%z|BabDm)uQrF?;y-u@JRb~~EB&i*Rq-fzm)a(K|O)?;7q+A3!OL8FRq})9_ zqY#7SL6Mh=5-jK>1&|9;?g6=r6hSW1C=m97+)YX#m!$nUqAe@sp51@P2g_=ZdY$=> z?b4O)23Fb)7m_X5r*5MD%h3E>LD%K-aFi!~5>@e0l+5ne?&;!i?h zbD&%iKWFF22`3$-WX?V6Z`KLBj_=E}$0+2Rhv+1+tllZG;;tN3PzBM5n#}5&9%EBb z$A9z=@o77^2Pq9S*Qrr{oBtnLsR#qNjuITiyaa}iVemMB9?_#v`%4h*b+Oa= z+2DJid>hJs8J_F|fU?Hk$2rEy65>o@@YJl_6E_QFUd9w-Yp~ul*5SJIO^e=b)S>az zHbR=)$aM$dXl{eF49yDTY>8OcovC(;BwZ@xu^`tT%RKwOjQj!UWj9hpuB%7P<!VCI@~TBx)6`+eju81uX)~VpX;`7q0wU#l5_|6CH3`I#AJ7Q@L(I+4zn1D1)UDso=?eF;*0Lt$zzVXz;-B22ksFtd+bHIuxG zUxvB`6?|HPeSd3lEngg8WCgIE7=Z_dZ()QHL>S8DJ`pd?x6Fa| zhiX}te8sC1>hcQZmJeSp&NSp)LAV7FB)2A9XRWg7doIJ%G7WCCYGb<+77S&aC2%n| zW3$uWayX>BLTf%{&=omlIvR2}wwK?1hpbiXmh9qOHtHZ=U;Btc*TzpB!|w*ew!!co su~h^-3F-VvQ3m=nINx;XYq$r;j*?cxKQtUUlNd8JW5Ad&vhdIS4|pf0=l}o! delta 1890 zcmZ`(-)|d55cb)3_W9TO96KRtlQboy;S!QG4GoECD^x*JEK(X$Kpd3A@m|P9=bV{4 zlSV=WM0f!S6s?dzLP7!wsm~P;yzqp?|FAsq2UI*E@yN`cBPS-cwcp;%?0!2lJ3HSj ze0ABV7mK+B{*+(uY<=2)WDG06A5G63*lCtvs-LklIBUM%%-UHcA$2)BhjZG`Hw$(_ z>N0+@Y1oF6SWPgUWe*c9>y|!C*(S<4mPa`+3RSXDl(%qzFKul$lG-yz2*lQpHvAvvd5{Kw>_DO0LXLy1Hk zp@Jh!BIpX}eD#SDUP?V0Es!~;qc2)5bjR^_y~uHf9-dP#rshUTnQ`yCjdtV;Eo!%X z7e-KNryD`D=|xr=Bu|kj48D_@sGg<-@@$mOj?6lyzkL8M#t zFx+D>gmydMXhRr4!$%#QBfj$pmJ&~@P?^_=AVY>US97!DXPhaDTM#sz(2XK*_x*60 zzn(kPrNPn}2>G70U2whQ~co>Nk8@s<2r*E-Q^E%0_DOO?Tj%vRm=+u%Z+ zbn>Vt=eJO|_VCaA$H{|>WPXxhoZu;f34%)mlLSu_CS)UPbbI{%?>nm;OiIdtLl2Dx?&bvyjX{Py}yvTh&QZkT3O}XY&cH{! zR(PvJ zzvzEZmJfnnicnkiXd5Q^ivtq_HB?x7=iz}Hw*4rCB0m_gE_9EeY7hqb>|k5O9vQjp zbNW!Xj^>E>djC5VAqu~(8ahoRD8sKY#wQ*RdO>ZFBX4rswYk&6m;zD_maGw7? zICwy1V3*eRH`ZTao3n8@3uVI+>0oom#qOmO6LnUghXPbVKuG~Q?kvBAeytg>w(s7c at>G|2L7_i(WQq8lGBtC=Tu@EJH2wo&vuFMQ diff --git a/ArticleSpider/ArticleSpider/__pycache__/settings.cpython-39.pyc b/ArticleSpider/ArticleSpider/__pycache__/settings.cpython-39.pyc index b018caaaf1d9c6559e4fa6b6161a6e3aee8e0988..22bc2d03528414b60d83b048b86fe85477c1b37d 100644 GIT binary patch delta 97 zcmdnZxtEhSk(ZZ?0SFi$1gCx6$UBXZmkr3v)L~>`&}6F8oxFijLfkbcvA85Nxi~ek qC^;h_vmiAmGcOe)xA`mMdM0sUCPt>8Yzi!lK*%h@$j1a`F#-VF#urQg delta 93 zcmdnXxto(Wk(ZZ?0SI313`pCyk#`y+FDsCjsl&*?pvhFFJ$VD8gs^XAabZqKd1i4* qYDz$6K~82~>g08dQk%asu4fV#Vq#?a#U{_f2!zZcjC@RB79#+a#TI-3 diff --git a/ArticleSpider/ArticleSpider/items.py b/ArticleSpider/ArticleSpider/items.py index 902ba99..5076fd8 100644 --- a/ArticleSpider/ArticleSpider/items.py +++ b/ArticleSpider/ArticleSpider/items.py @@ -1,64 +1,86 @@ -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html -import re -import scrapy -from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join -from scrapy.loader import ItemLoader -from ArticleSpider.models.es_types import ArticleType -from w3lib.html import remove_tags - -class ArticlespiderItem(scrapy.Item): - # define the fields for your item here like: - # name = scrapy.Field() - pass - - -def date_convert(value): - match_re = re.match('.*?(\d+.*)', value) - if match_re: - return match_re.group(1) - else: - return '1970-07-01' - - -class ArticleItemLoader(ItemLoader): - default_output_processor = TakeFirst() - - -class JobBoleArticleItem(scrapy.Item): - title = scrapy.Field() # 标题 - create_date = scrapy.Field( - input_processor=MapCompose(date_convert) - ) # 发布时间 - url = scrapy.Field() # 链接 - url_object_id = scrapy.Field() # 链接id - front_image_url = scrapy.Field( - output_processor=Identity() - ) # 封面图 - front_image_path = scrapy.Field() # 封面图路径 - praise_nums = scrapy.Field() # 点赞数 - comment_nums = scrapy.Field() # 评论数 - fav_nums = scrapy.Field() # 收藏数 - tags = scrapy.Field( - output_processor=Join(separator=',') - ) # 标签 - content = scrapy.Field() # 内容 - - def save_to_es(self): - article = ArticleType() - article.title = self['title'] - article.create_date = self['create_date'] - article.content = remove_tags(self['content']) - article.front_image_url = self['front_image_url'] - if 'front_image_path' in self: - article.front_image_path = self['front_image_path'] - article.praise_nums = self['praise_nums'] - article.fav_nums = self['fav_nums'] - article.comment_nums = self['comment_nums'] - article.url = self['url'] - article.tags = self['tags'] - article.meta.id = self['url_object_id'] - article.save() - return \ No newline at end of file +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html +import re +import scrapy +from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join +from scrapy.loader import ItemLoader +from ArticleSpider.models.es_types import ArticleType +from w3lib.html import remove_tags +from elasticsearch_dsl.connections import connections + +es = connections.create_connection(ArticleType._doc_type.using) + + +class ArticlespiderItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass + + +def gen_suggests(index, info_tuple): + # 根据字符串生成字符串搜索建议数组 + used_words = set() # 去重 + suggests = [] + for text, weight in info_tuple: + if text: + # 调用es的analyze接口分析字符串 + words = es.indices.analyze(index=index, analyzer='ik_max_word', params={'filter': ['lowercase']}, body=text) + anylyzed_words = set([r['token'] for r in words['tokens'] if len(r['token']) > 1]) + new_words = anylyzed_words - used_words + else: + new_words = set() + if new_words: + suggests.append({'input': list(new_words), 'weight': weight}) + return suggests + + +def date_convert(value): + match_re = re.match('.*?(\d+.*)', value) + if match_re: + return match_re.group(1) + else: + return '1970-07-01' + + +class ArticleItemLoader(ItemLoader): + default_output_processor = TakeFirst() + + +class JobBoleArticleItem(scrapy.Item): + title = scrapy.Field() # 标题 + create_date = scrapy.Field( + input_processor=MapCompose(date_convert) + ) # 发布时间 + url = scrapy.Field() # 链接 + url_object_id = scrapy.Field() # 链接id + front_image_url = scrapy.Field( + output_processor=Identity() + ) # 封面图 + front_image_path = scrapy.Field() # 封面图路径 + praise_nums = scrapy.Field() # 点赞数 + comment_nums = scrapy.Field() # 评论数 + fav_nums = scrapy.Field() # 收藏数 + tags = scrapy.Field( + output_processor=Join(separator=',') + ) # 标签 + content = scrapy.Field() # 内容 + + def save_to_es(self): + article = ArticleType() + article.title = self['title'] + article.create_date = self['create_date'] + article.content = remove_tags(self['content']) + article.front_image_url = self['front_image_url'] + if 'front_image_path' in self: + article.front_image_path = self['front_image_path'] + article.praise_nums = self['praise_nums'] + article.fav_nums = self['fav_nums'] + article.comment_nums = self['comment_nums'] + article.url = self['url'] + article.tags = self['tags'] + article.meta.id = self['url_object_id'] + article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7))) + article.save() + return diff --git a/ArticleSpider/ArticleSpider/models/__pycache__/__init__.cpython-39.pyc b/ArticleSpider/ArticleSpider/models/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..412863890a96bb140bc0d79beeb0d1743dd8b0b2 GIT binary patch literal 152 zcmYe~<>g`kf~;SGX(0MBh(HF6K#l_t7qb9~6oz01O-8?!3`HPe1o6w(*(xTqIJKxa z#?Zjh)HKGis3bEvCpEYrGbObMjhCCBlA2Q-6Ca2$|4!FJfD z*Y36d04I<*z`taUocI^Gz_ZirrtRX$zq}9o+5WxvhWC1TfaBjkk7qx}LGYJTwg=jR zL!A0|1QF0c5D`{H5=w}ZqbigUjbuz?nb1VGXiKIvl^M-so3>?6bJ?LC*`-~-KCXJQ zPX}I3s-Yaw5eZ;)z9U+5BGPM&`gI^O(S99>HVm#J+K0Z#MF+jz>ySP`Z%_2m+lL6f zW7rjgm!bL%#=q?S8erVvaWEO#+`lf|O}9sg${|iYMu314 zBtLZIzYb_5B79`5`sgYM!e%(x%_E%JrLGKcA({T+NQgC5Gqr@AiLKA2S%m7L$WLE)gLyp+- z*!9@+*vGOB=+NWHts(SiQ2<3YTv;N4Sk$xJM}YD$7Vr)JLdICwq6oDffBtso1m?P4#j<$EH(V=SoazDo}}ERw%%p zWUgCP$62DKs%#H|HNFj5DePdTn%b~Z@;NZ)A8s2K+?=T%?xmfD;-v;wFQvBKqLC7C zo7b`#|7k77hR?N4u{#ZV{UhG>VD$k;bR6l22w9kuoV4!z&=&f2u9hV)YsRLNeOJzW zY0etOm~sV1M^JH%t)_vi;*1GhDc7o=0@ZR->cHo|?IcH`r^7S0e%!-*K@K*(=&PR& z*zMkwyILZ!s06-eS#7EJ@BNQk%SzQ&SWyDGO4;1R|J0gyH1pobrsuli9gc1dwb8 zNf}MH=6cAg4P@S$tjHZ94Yr5#7E4xfe%>wCoczR;;#({wsTCzfx|64HYXJ?}&n<@R pg~<=N>r^!1RvCd<_8Pn|Rb2T_>;NxsT)qsMJheD*!?}NLv5^ delta 332 zcmcaCeOa0}k(ZZ?0SL;h1JiDA