You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
import pandas as pd
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import random
|
|
|
|
|
from time import sleep
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
|
def get_page(url):
    """Fetch *url* and return the response body as text.

    Retries up to 3 times on a failed request. Returns None when every
    attempt fails, so callers must check the result before parsing it.

    Args:
        url: The page URL to request.

    Returns:
        The response text on success, or None after 3 failed attempts.
    """
    attempts = 3
    # Request headers are loop-invariant; build them once, not per retry.
    headers = {
        'Cookie': 'OCSSID=4df0bjva6j7ejussu8al3eqo03',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    while True:
        try:
            # sleep(random.uniform(1, 2))  # optional random 1-2s delay between requests
            response = requests.get(url, headers=headers, timeout=20)
            return response.text
        except requests.exceptions.RequestException:
            # Catch only request-level failures (timeouts, connection errors,
            # HTTP protocol errors) — not arbitrary programming errors.
            attempts -= 1
            if attempts == 0:
                print('请求3次均失败,放弃此url请求,检查请求条件')
                return None
            print('请求失败,重新请求')
|
|
|
|
|
|
|
|
|
|
# Load the previously scraped recipe index; one row per recipe.
# Columns: name, link, ingredients, rating, author, image, description.
df = pd.read_csv('cai.csv', encoding="utf-8",
                 names=['名称', '链接', '素材', '评分', '作者', '图片', '说明'])  # gb18030

# Collected cooking-instruction text, one entry per recipe link (kept in
# the same order as df so it can be assigned as a new column afterwards).
zuofa = []

for link in tqdm(df['链接']):
    sleep(3)  # throttle: be polite to the server between requests

    # Fetch the recipe detail page; get_page returns None after 3 failed
    # attempts, so guard before handing the content to BeautifulSoup
    # (BeautifulSoup(None, ...) would raise and abort the whole run).
    html_content = get_page(link)

    steps_text = ''
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')

        # The cooking steps live in <div class="step"> on the target site.
        recipe_elements = soup.find_all('div', class_='step')
        try:
            steps_text = recipe_elements[0].text
            # Flatten the step text onto one line and strip all spaces
            # (the text is Chinese, so spaces carry no meaning).
            steps_text = steps_text.replace("\n", " ")
            steps_text = steps_text.replace("\r", " ")
            steps_text = steps_text.replace(" ", "")
        except IndexError:
            # Page had no step section — record an empty entry so the
            # column stays aligned with the dataframe rows.
            steps_text = ''
    zuofa.append(steps_text)

df['做法'] = zuofa

df.to_csv('菜谱1.csv', index=False)
|
|
|
|
|
|