"""Scraper for huxiu.com articles about large language models (LLMs).

Collects candidate articles from the homepage, the site search page, and a
few fallback channel/tag pages; fetches each article's full text; extracts
key sentences, themes, and numeric facts; then writes the results to an
Excel workbook plus a plain-text summary report.
"""

import requests
from bs4 import BeautifulSoup
import json
import time
import re
import pandas as pd
import random
from urllib.parse import quote, urljoin
from datetime import datetime
import logging

# Logging setup
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class HuxiuSpider:
    """Crawl huxiu.com for LLM-related articles and derive insight records."""

    def __init__(self):
        self.session = requests.Session()
        # Browser-like headers to reduce the chance of being blocked.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.base_url = "https://www.huxiu.com"

    def random_delay(self):
        """Sleep 2-4 seconds to be polite to the target site."""
        time.sleep(random.uniform(2, 4))

    def get_with_retry(self, url, retries=3):
        """GET ``url`` with up to ``retries`` attempts and exponential backoff.

        Returns the ``requests.Response`` on HTTP 200, or ``None`` if every
        attempt fails or returns a non-200 status.
        """
        for attempt in range(retries):
            try:
                response = self.session.get(url, headers=self.headers, timeout=15)
                if response.status_code == 200:
                    return response
                else:
                    logger.warning(f"请求返回状态码 {response.status_code}: {url}")
            except Exception as e:
                logger.warning(f"请求失败 {url} (尝试 {attempt + 1}/{retries}): {e}")
            # Back off before the next attempt (1s, 2s, 4s, ...).
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
        return None

    def get_articles_from_homepage(self):
        """Scrape article cards from the homepage.

        Tries a list of known card CSS selectors and parses the first
        selector that matches anything. Returns a list of article dicts
        (possibly empty).
        """
        try:
            logger.info("从虎嗅网首页获取文章...")
            response = self.get_with_retry(self.base_url)
            if not response:
                return []
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = []
            # Candidate card selectors — the site markup changes over time.
            article_selectors = [
                '.article-item',
                '.mod-art',
                '.vertical-article',
                '.article-card',
                '.newsfeed-item'
            ]
            for selector in article_selectors:
                items = soup.select(selector)
                if items:
                    logger.info(f"找到 {len(items)} 个文章项目,使用选择器: {selector}")
                    for item in items[:20]:  # cap the number of cards parsed
                        article = self.parse_article_card(item)
                        if article:
                            articles.append(article)
                    break
            return articles
        except Exception as e:
            logger.error(f"从首页获取文章时出错: {e}")
            return []

    def parse_article_card(self, item):
        """Parse one homepage card into an article dict, or ``None``.

        A card counts as an article only if it links to
        ``/article/<id>.html``; summary/author/time fields are best-effort.
        """
        try:
            # The article link doubles as the title element.
            title_elem = item.find('a', href=re.compile(r'/article/\d+\.html'))
            if not title_elem:
                return None
            title = title_elem.get_text().strip()
            link = title_elem.get('href')
            if link and not link.startswith('http'):
                link = urljoin(self.base_url, link)
            # Summary may live in a <p> or a <div>.
            summary_elem = item.find('p', class_=re.compile(r'brief|summary|desc'))
            if not summary_elem:
                summary_elem = item.find('div', class_=re.compile(r'brief|summary|desc'))
            summary = summary_elem.get_text().strip() if summary_elem else ""
            # Author and publish time are optional.
            author_elem = item.find(['span', 'a'], class_=re.compile(r'author|writer'))
            author = author_elem.get_text().strip() if author_elem else ""
            time_elem = item.find('span', class_=re.compile(r'time|date'))
            publish_time = time_elem.get_text().strip() if time_elem else ""
            return {
                'title': title,
                'link': link,
                'summary': summary,
                'author': author,
                'publish_time': publish_time,
                'source': 'homepage'
            }
        except Exception as e:
            logger.error(f"解析文章卡片时出错: {e}")
            return None

    def search_articles_direct(self, keywords):
        """Run the site search for each keyword and collect parsed results."""
        all_articles = []
        for keyword in keywords:
            logger.info(f"直接搜索关键词: {keyword}")
            try:
                # Huxiu's HTML search page.
                search_url = f"https://www.huxiu.com/search.html?keyword={quote(keyword)}"
                response = self.get_with_retry(search_url)
                if not response:
                    continue
                soup = BeautifulSoup(response.text, 'html.parser')
                # Parse the search result cards.
                search_results = soup.find_all('div', class_=re.compile(r'article-item|search-result'))
                for item in search_results:
                    article = self.parse_search_result(item, keyword)
                    if article:
                        all_articles.append(article)
                logger.info(f"关键词 '{keyword}' 找到 {len(search_results)} 个结果")
                self.random_delay()
            except Exception as e:
                logger.error(f"搜索关键词 '{keyword}' 时出错: {e}")
                continue
        return all_articles

    def parse_search_result(self, item, keyword):
        """Parse one search-result card; ``None`` if it is not LLM-related."""
        try:
            title_elem = item.find('a', href=re.compile(r'/article/\d+\.html'))
            if not title_elem:
                return None
            title = title_elem.get_text().strip()
            link = title_elem.get('href')
            if link and not link.startswith('http'):
                link = urljoin(self.base_url, link)
            # Keep only titles that mention an LLM/AI term.
            if not any(word in title for word in ['大模型', 'LLM', '语言模型', 'GPT', 'ChatGPT', 'AI', '人工智能']):
                return None
            summary_elem = item.find('p', class_=re.compile(r'brief|summary'))
            summary = summary_elem.get_text().strip() if summary_elem else ""
            return {
                'title': title,
                'link': link,
                'summary': summary,
                'keyword': keyword,
                'source': 'direct_search'
            }
        except Exception as e:
            logger.error(f"解析搜索结果时出错: {e}")
            return None

    def get_article_content(self, article_url):
        """Fetch an article page and extract its body text and key points.

        Returns a dict with ``full_content``, ``main_points``,
        ``content_length`` and ``source_method``; on failure returns the
        placeholder structure from :meth:`get_empty_content`.
        """
        try:
            logger.info(f"获取文章内容: {article_url}")
            response = self.get_with_retry(article_url)
            if not response:
                return self.get_empty_content()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Candidate content containers, most specific first.
            content_selectors = [
                '.article-content-wrap',
                '.article-content',
                '.article-detail-content',
                '.article-main-content',
                '.content'
            ]
            content = ""
            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Strip scripts, styles and chrome before extracting text.
                    for tag in content_elem(['script', 'style', 'nav', 'footer', 'aside']):
                        tag.decompose()
                    content = content_elem.get_text().strip()
                    content = re.sub(r'\s+', ' ', content)
                    if len(content) > 200:
                        break
            # Fallback: join all reasonably long paragraphs on the page.
            if not content or len(content) < 200:
                paragraphs = soup.find_all('p')
                content = ' '.join([p.get_text().strip() for p in paragraphs
                                    if len(p.get_text().strip()) > 20])
                content = re.sub(r'\s+', ' ', content)
            main_points = self.extract_main_points(content)
            return {
                'full_content': content,
                'main_points': main_points,
                'content_length': len(content),
                'source_method': 'web'
            }
        except Exception as e:
            logger.error(f"获取文章内容时出错 {article_url}: {e}")
            return self.get_empty_content()

    def get_empty_content(self):
        """Return the placeholder structure used when fetching fails."""
        return {
            'full_content': "无法获取内容",
            'main_points': "无法提取主要观点",
            'content_length': 0,
            'source_method': 'failed'
        }

    def extract_main_points(self, content):
        """Pick up to five key sentences out of ``content``.

        Sentences containing any keyword from the groups below are preferred;
        if fewer than three match, long sentences are used as filler. Falls
        back to a 300-character excerpt when the result is too short.
        """
        if not content or len(content) < 50:
            return "内容过短,无法提取主要观点"
        # Split into sentences on CJK and ASCII terminators.
        sentences = re.split(r'[。!?.!?]', content)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 15]
        key_sentences = []
        keyword_groups = [
            ['大模型', 'LLM', '语言模型'],
            ['GPT', 'ChatGPT', 'OpenAI'],
            ['人工智能', 'AI', '智能'],
            ['认为', '观点', '应该', '需要', '重要', '关键'],
            ['趋势', '发展', '未来', '前景']
        ]
        for sentence in sentences:
            for keywords in keyword_groups:
                if any(keyword in sentence for keyword in keywords):
                    if sentence not in key_sentences:
                        key_sentences.append(sentence)
                    break
        # Top up with long sentences if the keyword match was thin.
        if len(key_sentences) < 3:
            key_sentences.extend([s for s in sentences if len(s) > 30][:5 - len(key_sentences)])
        # De-duplicate while preserving order, then cap at five sentences.
        unique_sentences = []
        seen = set()
        for s in key_sentences:
            if s not in seen:
                seen.add(s)
                unique_sentences.append(s)
        main_points = "。".join(unique_sentences[:5]) + "。"
        # Still too short: return a raw excerpt instead.
        if len(main_points) < 50:
            main_points = content[:300] + "..." if len(content) > 300 else content
        return main_points

    def analyze_article_themes(self, content, title):
        """Classify the article into zero or more themes by keyword match.

        Matching is case-insensitive over both the body and the title.
        Returns at least one theme ('综合讨论' or '简要报道' as a default).
        """
        themes = []
        content_lower = content.lower()
        title_lower = title.lower()
        # NOTE: duplicate keywords that previously appeared in some lists
        # ('投资'/'融资', '开源') were removed — duplicates are dead entries
        # under any() and changed nothing.
        theme_keywords = {
            '技术发展': ['技术', '算法', '架构', '训练', '参数', '模型结构', 'transformer', '神经网络'],
            '商业应用': ['商业', '应用', '落地', '场景', '企业', '客户', '产品', '服务', '商业化'],
            '投资融资': ['投资', '融资', '资本', '基金', '估值', '市值'],
            '市场竞争': ['竞争', '市场', '对手', '领先', '优势', '市场份额', 'bat', '微软', '谷歌'],
            '政策监管': ['政策', '监管', '法规', '合规', '政府', '立法', '安全', '伦理'],
            '挑战风险': ['挑战', '风险', '问题', '困难', '局限', '不足', '缺陷', '担忧'],
            '未来趋势': ['趋势', '未来', '预测', '展望', '方向', '发展', '前景', '机会'],
            '开源生态': ['开源', '社区', '生态', '开放', '贡献', '协作'],
            '行业影响': ['行业', '影响', '变革', '革命', '颠覆', '创新', '改变']
        }
        for theme, keywords in theme_keywords.items():
            if any(keyword in content_lower or keyword in title_lower for keyword in keywords):
                themes.append(theme)
        # No keyword hit: infer a generic label from the content length.
        if not themes:
            if len(content) > 500:
                themes.append('综合讨论')
            else:
                themes.append('简要报道')
        return themes

    def extract_key_data(self, content):
        """Extract labelled numeric facts (at most five, de-duplicated)."""
        key_data = []
        # (regex, label) pairs for figures commonly quoted in LLM coverage.
        patterns = [
            (r'(\d+(?:\.\d+)?)[亿万]*(?:个|款|家|项|种)(?:大模型|模型)', '模型数量'),
            (r'参数[^\d]*(\d+(?:\.\d+)?)[亿万]*', '参数规模'),
            (r'投资[^\d]*(\d+(?:\.\d+)?)[亿万]*(?:元|美元)', '投资金额'),
            (r'市场[^\d]*(\d+(?:\.\d+)?)[亿万]*(?:元|美元)', '市场规模'),
            (r'增长[^\d]*(\d+(?:\.\d+)?)%', '增长率'),
            (r'(\d+(?:\.\d+)?)[亿万]*参数', '参数量'),
            (r'准确[^\d]*(\d+(?:\.\d+)?)%', '准确率')
        ]
        for pattern, label in patterns:
            matches = re.findall(pattern, content)
            for match in matches:
                key_data.append(f"{label}:{match}")
        return list(set(key_data))[:5]

    def run_comprehensive_crawl(self):
        """Main pipeline: collect, de-duplicate, fetch, analyse, save.

        Falls back to :meth:`run_fallback_crawl` when neither the homepage
        nor the search yields any articles. Returns the insight list.
        """
        logger.info("开始综合爬取虎嗅网大模型相关文章...")
        all_articles = []

        # Method 1: homepage cards, filtered to LLM-related titles.
        logger.info("方法1: 从首页获取文章")
        homepage_articles = self.get_articles_from_homepage()
        relevant_articles = []
        for article in homepage_articles:
            title = article.get('title', '').lower()
            if any(keyword in title for keyword in ['大模型', 'llm', '语言模型', 'gpt', 'chatgpt', 'ai', '人工智能']):
                relevant_articles.append(article)
        all_articles.extend(relevant_articles)
        logger.info(f"首页找到 {len(relevant_articles)} 篇相关文章")

        # Method 2: direct keyword search.
        logger.info("方法2: 直接搜索关键词")
        keywords = ['大模型', 'LLM', '语言模型', 'GPT', 'ChatGPT']
        search_articles = self.search_articles_direct(keywords)
        all_articles.extend(search_articles)
        logger.info(f"搜索找到 {len(search_articles)} 篇文章")

        # De-duplicate on title, keeping first occurrence.
        unique_articles = []
        seen_titles = set()
        for article in all_articles:
            title = article.get('title', '')
            if title and title not in seen_titles:
                seen_titles.add(title)
                unique_articles.append(article)
        logger.info(f"去重后剩余 {len(unique_articles)} 篇唯一文章")

        if not unique_articles:
            logger.warning("没有找到任何文章,尝试备选方案...")
            return self.run_fallback_crawl()

        # Fetch full content for every unique article.
        logger.info("开始获取文章详细内容...")
        detailed_articles = []
        for i, article in enumerate(unique_articles):
            logger.info(f"处理第 {i+1}/{len(unique_articles)} 篇: {article['title'][:30]}...")
            if article.get('link'):
                detail = self.get_article_content(article['link'])
                article.update(detail)
            detailed_articles.append(article)
            self.random_delay()

        # Analyse and persist.
        logger.info("分析文章关键观点...")
        insights = self.generate_insights(detailed_articles)
        self.save_results(insights)
        return insights

    def run_fallback_crawl(self):
        """Fallback: harvest article links from channel/tag listing pages."""
        logger.info("启动备选爬取方案...")
        urls_to_try = [
            "https://www.huxiu.com/channel/107.html",  # tech channel
            "https://www.huxiu.com/channel/101.html",  # business channel
            "https://www.huxiu.com/tag/267.html",      # AI tag
            "https://www.huxiu.com/tag/人工智能.html"
        ]
        all_articles = []
        for url in urls_to_try:
            try:
                logger.info(f"尝试访问: {url}")
                response = self.get_with_retry(url)
                if response and response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    links = soup.find_all('a', href=re.compile(r'/article/\d+\.html'))
                    for link in links[:10]:  # cap links per page
                        title = link.get_text().strip()
                        if any(keyword in title.lower() for keyword in ['大模型', 'llm', '语言模型', 'gpt']):
                            article_url = urljoin(self.base_url, link.get('href'))
                            article = {
                                'title': title,
                                'link': article_url,
                                'source': 'fallback'
                            }
                            all_articles.append(article)
                self.random_delay()
            except Exception as e:
                logger.error(f"访问 {url} 时出错: {e}")
                continue
        if all_articles:
            logger.info(f"备选方案找到 {len(all_articles)} 篇文章")
            detailed_articles = []
            for i, article in enumerate(all_articles):
                logger.info(f"处理备选文章 {i+1}/{len(all_articles)}: {article['title'][:30]}...")
                detail = self.get_article_content(article['link'])
                article.update(detail)
                detailed_articles.append(article)
                self.random_delay()
            insights = self.generate_insights(detailed_articles)
            self.save_results(insights)
            return insights
        return []

    def generate_insights(self, articles):
        """Turn detailed articles into flat insight dicts (Chinese keys).

        Articles with less than 100 characters of content are skipped.
        """
        insights = []
        for article in articles:
            if article.get('content_length', 0) > 100:
                content = article.get('full_content', '')
                title = article.get('title', '')
                themes = self.analyze_article_themes(content, title)
                key_data = self.extract_key_data(content)
                insight = {
                    '标题': title,
                    '发布时间': article.get('publish_time', '未知'),
                    '作者': article.get('author', '未知'),
                    '来源': article.get('source', '未知'),
                    '主要观点': article.get('main_points', ''),
                    '文章主题': '、'.join(themes),
                    '关键数据': '、'.join(key_data),
                    '内容长度': article.get('content_length', 0),
                    '文章链接': article.get('link', ''),
                    '获取方式': article.get('source_method', '未知'),
                    '爬取时间': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                insights.append(insight)
        return insights

    def save_results(self, insights):
        """Write insights to a timestamped Excel file and emit a report."""
        if not insights:
            logger.warning("没有找到有效数据可保存")
            return
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"虎嗅网大模型观点分析_{timestamp}.xlsx"
        try:
            df = pd.DataFrame(insights)
            # Preferred column order; unknown columns are appended at the end.
            column_order = [
                '标题', '主要观点', '文章主题', '关键数据', '作者',
                '发布时间', '来源', '内容长度', '获取方式', '文章链接', '爬取时间'
            ]
            existing_columns = [col for col in column_order if col in df.columns]
            other_columns = [col for col in df.columns if col not in column_order]
            df = df[existing_columns + other_columns]
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='大模型观点', index=False)
                # Auto-size columns to the longest cell, capped at width 50.
                worksheet = writer.sheets['大模型观点']
                for column in worksheet.columns:
                    max_length = 0
                    column_letter = column[0].column_letter
                    for cell in column:
                        try:
                            if len(str(cell.value)) > max_length:
                                max_length = len(str(cell.value))
                        except Exception:
                            pass
                    adjusted_width = min(max_length + 2, 50)
                    worksheet.column_dimensions[column_letter].width = adjusted_width
            # BUG FIX: the f-string previously logged a literal placeholder
            # instead of the actual output path.
            logger.info(f"数据已保存到: {filename}")
            self.generate_report(insights, timestamp)
        except Exception as e:
            logger.error(f"保存结果时出错: {e}")

    def generate_report(self, insights, timestamp):
        """Write a plain-text summary report alongside the Excel output."""
        report_filename = f"虎嗅网大模型分析报告_{timestamp}.txt"
        total_articles = len(insights)
        if total_articles == 0:
            return
        avg_content_length = sum(insight['内容长度'] for insight in insights) / total_articles
        # Theme frequency across all insights.
        theme_count = {}
        for insight in insights:
            themes = insight['文章主题'].split('、') if insight['文章主题'] else []
            for theme in themes:
                theme_count[theme] = theme_count.get(theme, 0) + 1
        report = f"""虎嗅网大模型相关文章分析报告
生成时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
============================================

基本统计:
- 总文章数: {total_articles} 篇
- 平均内容长度: {avg_content_length:.0f} 字符

主题分布:
"""
        for theme, count in sorted(theme_count.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / total_articles) * 100
            report += f"- {theme}: {count} 篇 ({percentage:.1f}%)\n"
        report += f"\n代表性观点摘要 (前5篇):\n"
        for i, insight in enumerate(insights[:5]):
            report += f"\n{i+1}. {insight['标题']}\n"
            report += f"   主题: {insight['文章主题']}\n"
            report += f"   关键数据: {insight['关键数据']}\n"
            report += f"   主要观点: {insight['主要观点'][:150]}...\n"
        try:
            with open(report_filename, 'w', encoding='utf-8') as f:
                f.write(report)
            logger.info(f"分析报告已保存到: {report_filename}")
        except Exception as e:
            logger.error(f"保存报告时出错: {e}")


def main():
    """Entry point: run the full crawl and log a short result summary."""
    logger.info("启动虎嗅网大模型文章爬虫...")
    spider = HuxiuSpider()
    try:
        insights = spider.run_comprehensive_crawl()
        if insights:
            logger.info(f"\n爬取完成! 成功分析 {len(insights)} 篇文章")
            logger.info("前3篇文章标题:")
            for i, insight in enumerate(insights[:3]):
                logger.info(f"{i+1}. {insight['标题']}")
                logger.info(f"   主题: {insight['文章主题']}")
                logger.info(f"   观点摘要: {insight['主要观点'][:100]}...")
        else:
            logger.warning("没有找到相关文章")
            logger.warning("建议手动访问虎嗅网确认当前可用的文章列表")
    except Exception as e:
        logger.error(f"爬虫运行出错: {e}")
        import traceback
        logger.error(traceback.format_exc())


if __name__ == "__main__":
    main()