diff --git a/main.py b/main.py
new file mode 100644
index 0000000..5f3c7bc
--- /dev/null
+++ b/main.py
@@ -0,0 +1,61 @@
+import requests
+import os
+from urllib.parse import quote
+from pathlib import Path
+
+# Configuration
+KEYWORD = "工程师"  # search keyword ("engineer")
+SAVE_DIR = "/root/imgs4"  # save directory (must be writable)
+NUM_IMAGES = 20  # target number of images
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+    "Referer": "https://image.baidu.com/"
+}
+
+def download_images():
+    # Create the save directory
+    Path(SAVE_DIR).mkdir(parents=True, exist_ok=True)
+
+    # Build the request URL (URL-encode the keyword)
+    encoded_keyword = quote(KEYWORD)
+    url = f"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&word={encoded_keyword}"
+
+    # Fetch pages until the target number of images has been downloaded
+    downloaded = 0
+    page = 0
+    while downloaded < NUM_IMAGES:
+        params = {
+            "pn": page * 30,  # Baidu returns 30 results per page by default
+            "rn": 30
+        }
+        try:
+            response = requests.get(url, headers=HEADERS, params=params, timeout=10)
+            response.raise_for_status()
+            data = response.json()
+
+            # Stop if the page has no usable results (avoids an infinite loop)
+            items = data.get("data", [])
+            if not any("thumbURL" in item for item in items):
+                break
+
+            # Download each image from its thumbnail URL
+            for item in items:
+                if downloaded >= NUM_IMAGES:
+                    break
+                if "thumbURL" in item:
+                    img_url = item["thumbURL"]
+                    try:
+                        img_data = requests.get(img_url, headers=HEADERS, timeout=5).content
+                        with open(os.path.join(SAVE_DIR, f"engineer_{downloaded+1}.jpg"), "wb") as f:
+                            f.write(img_data)
+                        downloaded += 1
+                        print(f"Downloaded {downloaded}/{NUM_IMAGES}")
+                    except Exception as e:
+                        print(f"Failed to download {img_url}: {str(e)}")
+            page += 1
+        except Exception as e:
+            print(f"Request failed: {str(e)}")
+            break
+
+if __name__ == "__main__":
+    download_images()
\ No newline at end of file
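
A quick way to check the result after running this script is to count the files it wrote. The sketch below is only an illustration built on assumptions already present in the diff (the /root/imgs4 directory and the engineer_N.jpg naming pattern); the verify_downloads.py helper is hypothetical and not part of this change.

    # verify_downloads.py -- hypothetical helper, not included in this PR
    from pathlib import Path

    SAVE_DIR = "/root/imgs4"  # same directory main.py writes into
    files = sorted(Path(SAVE_DIR).glob("engineer_*.jpg"))
    print(f"Found {len(files)} downloaded images")
    for f in files[:5]:
        # spot-check the first few files by name and size
        print(f.name, f.stat().st_size, "bytes")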