From e26d2732745453e3add29226dd8c9a18d4b38a21 Mon Sep 17 00:00:00 2001 From: CriptDit <1487413334@qq.com> Date: Sun, 2 Jun 2024 18:59:21 +0800 Subject: [PATCH] commit009 --- TR/TrialRecommend/DataRequest.py | 26 ++++++++++++++----------- TR/TrialRecommend/Main.py | 8 ++++---- TR/TrialRecommend/TF_IDF.py | 33 ++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 15 deletions(-) create mode 100644 TR/TrialRecommend/TF_IDF.py diff --git a/TR/TrialRecommend/DataRequest.py b/TR/TrialRecommend/DataRequest.py index cf50f49..e645994 100644 --- a/TR/TrialRecommend/DataRequest.py +++ b/TR/TrialRecommend/DataRequest.py @@ -3,12 +3,16 @@ import random from DrissionPage import ChromiumOptions from DrissionPage import ChromiumPage import time +import TF_IDF + path=r"C:\Program Files\Google\Chrome\Application\chrome.exe" ChromiumOptions().set_browser_path(path).save() class DataRequest: - def __init__(self,keyword): + def __init__(self,keyword, user): + self.user=user + self.tf_idf='false' #创建文件对象 self.success='False' self.resp = None @@ -79,13 +83,16 @@ class DataRequest: return False def sort(self): - #按照喜欢人数排序 - self.items.sort(key=lambda item:int(item['喜欢数量']),reverse=True) - for i in self.items: - if self.is_in_csv(i['文章ID']): - continue - else: - self.csv_writer.writerow(i) + if self.tf_idf=='false': + #按照喜欢人数排序 + self.items.sort(key=lambda item:int(item['喜欢数量']),reverse=True) + for i in self.items: + if self.is_in_csv(i['文章ID']): + continue + else: + self.csv_writer.writerow(i) + else: + TF_IDF.tf_idf(self.items, self.user) def close(self): self.drive.listen.stop() @@ -100,9 +107,6 @@ class DataRequest: print('数据获取成功') break - # print('[1]:',line.split(',')[1]) - # break - if __name__=='__main__': dataRequest=DataRequest('厦门旅游攻略') for i in range(1): diff --git a/TR/TrialRecommend/Main.py b/TR/TrialRecommend/Main.py index 07d057c..b24154f 100644 --- a/TR/TrialRecommend/Main.py +++ b/TR/TrialRecommend/Main.py @@ -361,11 +361,11 @@ class Mainpage: def researchPage(self): self.clear_csv_file() - rs=self.research.get() - if rs=='': + rs = self.research.get() + if rs == '': messagebox.showinfo(title='提示', message='你还没有输入任何东西!') else: - if DataRequest.DataRequest(str(self.research.get())).success=='True': + if DataRequest.DataRequest(str(self.research.get()), self.user).success == 'True': self.content = '' self.time = '' @@ -383,7 +383,7 @@ class Mainpage: self.max_notes_count = 0 self.max_iamge_count = 0 - self.data_request = DataRequest.DataRequest(str(self.research.get())) + self.data_request = DataRequest.DataRequest(str(self.research.get()), self.user) time.sleep(random.uniform(0,1)) self.init_notes_request() else: diff --git a/TR/TrialRecommend/TF_IDF.py b/TR/TrialRecommend/TF_IDF.py new file mode 100644 index 0000000..c2e4d97 --- /dev/null +++ b/TR/TrialRecommend/TF_IDF.py @@ -0,0 +1,33 @@ +import csv + +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity + +def tf_idf(items, user): + # 数据载入 + articles = items + + df = pd.DataFrame(articles) + + # 提取TF-IDF特征 + tfidf_vectorizer = TfidfVectorizer() + tfidf_matrix = tfidf_vectorizer.fit_transform(df['content']) + + # 计算余弦相似度 + cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) + + # 根据相似度推荐 + def recommend_articles(article_id, user, cosine_sim=cosine_sim): + sim_scores = list(enumerate(cosine_sim[article_id - 1])) + sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) + sim_scores = sim_scores[1:6] # 推荐前5个相似文章 + article_indices = [i[0] for i in sim_scores] + with open('collections/{}.csv'.format(user), 'a+', encoding='utf-8',newline='') as f: + write=csv.DictWriter(f,fieldnames=['文章ID','content','时间','收藏数量','喜欢数量','图片资源']) + for j in article_indices: + write.writerow(items[j]) + + + # 推荐与文章ID为1的旅游攻略相似的文章 + recommend_articles(1,user) \ No newline at end of file