You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
26 lines
2.0 KiB
26 lines
2.0 KiB
2 months ago
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
|
||
|
def get_news_content(url):
|
||
|
headers = {
|
||
|
"cookie": "buvid4=7D777810-9AD0-8C21-1D6A-C607F528C7B427206-022123000-Sfw%2Bq8N2F39WtfvTG9WlSA%3D%3D; DedeUserID=352875468; DedeUserID__ckMd5=0a90e72ce13d5f80; buvid_fp_plain=undefined; is-2022-channel=1; FEED_LIVE_VERSION=V8; hit-new-style-dyn=1; enable_web_push=DISABLE; header_theme_version=CLOSE; buvid3=D0F7F1A0-043F-F252-3B6E-407C2F74F3E233731infoc; b_nut=1703867934; _uuid=10782D5106-6BD5-2F25-DB56-93210AD5B1077B97901infoc; hit-dyn-v2=1; rpdid=|(u))kkYuu|u0J'u~|)~)k)Ju; LIVE_BUVID=AUTO3717091338559729; CURRENT_QUALITY=80; fingerprint=77f845d4623f8049224f6d42350abef3; PVID=1; home_feed_column=5; browser_resolution=1897-998; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNTAzOTIsImlhdCI6MTcyNTg5MTEzMiwicGx0IjotMX0.-PIHJf-y8eiCUpgjbSr80_6MMNByawusvATxL1TlWEg; bili_ticket_expires=1726150332; SESSDATA=e4358344%2C1741449782%2Cf61a4%2A92CjCRkwSuqlWD2A7212ZB1TNWsYRH10J2J7J5MA_OLtgkpVB-NiWTShPTTV1Uvij4R28SVjRCcl9yUW9sREZnQWNxMXpsQ3IxdC03QVJsMURLUTBuMjJkRVVZYjlNVUZfVUhNYjZlUEgtdDI0aVlfZnZRUjBqb01EZjFMWDAyaW9LQW9XWDJPdnBnIIEC; bili_jct=a84563cf63b9e39aae094c54f58ac264; sid=690q9qio; buvid_fp=77f845d4623f8049224f6d42350abef3; CURRENT_FNVAL=4048; b_lsid=883A9109D_191E0390D0E; bp_t_offset_352875468=975839826899107840",
|
||
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
||
|
} # 请求头伪装浏览器
|
||
|
response = requests.get(url, headers=headers)
|
||
|
soup = BeautifulSoup(response.content, 'html.parser')
|
||
|
|
||
|
# 提取新闻标题和正文
|
||
|
title = soup.find('h1').text
|
||
|
paragraphs = soup.find_all('p')
|
||
|
content = " ".join([p.text for p in paragraphs])
|
||
|
|
||
|
return title, content
|
||
|
|
||
|
|
||
|
# 示例:爬取某个媒体的新闻
|
||
|
url = "https://example.com/news/world-event"
|
||
|
title, content = get_news_content(url)
|
||
|
print(f"Title: {title}")
|
||
|
print(f"Content: {content}")
|