爬虫主函数

main
hnu202310040215 4 months ago
parent 9aa7c37dc0
commit d260b45183

@ -0,0 +1,56 @@
import requests
import os
from urllib.parse import quote
from pathlib import Path
# Configuration parameters for the crawler (edit these to change a run).
KEYWORD = "工程师" # search keyword ("engineer"); URL-encoded before use
SAVE_DIR = "/root/imgs4" # output directory (the process needs write permission here)
NUM_IMAGES = 20 # stop after this many images have been saved
# Browser-like headers: a desktop User-Agent plus the Baidu Images Referer,
# presumably so the acjson endpoint serves results instead of blocking the
# request as a bot — TODO confirm against current Baidu behavior.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
"Referer": "https://image.baidu.com/"
}
def download_images(keyword=KEYWORD, save_dir=SAVE_DIR, num_images=NUM_IMAGES):
    """Crawl Baidu image search and save result thumbnails as .jpg files.

    Pages through the ``acjson`` JSON endpoint, downloading each result's
    ``thumbURL`` until ``num_images`` files have been written or the result
    set is exhausted.

    Args:
        keyword: Search term (defaults to module-level ``KEYWORD``).
        save_dir: Directory for the images; created if missing
            (defaults to module-level ``SAVE_DIR``).
        num_images: Target number of downloads (defaults to ``NUM_IMAGES``).
    """
    # Create the save directory, including any missing parents.
    Path(save_dir).mkdir(parents=True, exist_ok=True)

    # URL-encode the keyword so non-ASCII terms survive in the query string.
    encoded_keyword = quote(keyword)
    url = f"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&word={encoded_keyword}"

    downloaded = 0
    page = 0
    while downloaded < num_images:
        params = {
            "pn": page * 30,  # Baidu serves 30 results per page by default
            "rn": 30,
        }
        try:
            response = requests.get(url, headers=HEADERS, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            # A failed listing request is fatal for the run: report and stop.
            print(f"请求失败: {str(e)}")
            break

        items = data.get("data", [])
        if not items:
            # Bug fix: without this guard, an exhausted/empty result set made
            # the while-loop spin forever, requesting ever-higher empty pages.
            break

        # Extract each result's real thumbnail URL and fetch it.
        for item in items:
            if downloaded >= num_images:
                break
            img_url = item.get("thumbURL")
            if not img_url:
                continue
            try:
                img_data = requests.get(img_url, headers=HEADERS, timeout=5).content
                out_path = os.path.join(save_dir, f"engineer_{downloaded + 1}.jpg")
                with open(out_path, "wb") as f:
                    f.write(img_data)
                downloaded += 1
                print(f"已下载 {downloaded}/{num_images}")
            except Exception as e:
                # One failed image is non-fatal; report it and keep going.
                print(f"下载失败 {img_url}: {str(e)}")
        page += 1
# Entry point: run the crawler only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    download_images()
Loading…
Cancel
Save