You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
62 lines
2.1 KiB
62 lines
2.1 KiB
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
def fetch_job_info1(url, output_file1, timeout=10):
    """Append the text of every ``<p>`` element on a page to a file.

    The page at ``url`` is downloaded with a desktop-browser User-Agent,
    decoded as UTF-8 and parsed with the ``lxml`` parser. The
    whitespace-stripped text of each ``<p>`` element is written on its own
    line, in document order.

    Args:
        url: Address of the page to download.
        output_file1: Path of the text file to append to. It is opened in
            append mode with UTF-8 encoding, so existing content is kept.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block forever.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
    }
    r = requests.get(url, headers=head, timeout=timeout)
    # Force UTF-8 instead of the encoding requests guesses from the headers.
    r.encoding = 'utf-8'
    mysoup = BeautifulSoup(r.text, 'lxml')

    # One line per paragraph; join avoids repeated string reallocation.
    job_info = "".join(
        paragraph.get_text(strip=True) + "\n"
        for paragraph in mysoup.select("p")
    )

    # Write the collected text to the output file (append mode).
    with open(output_file1, 'a', encoding='utf-8') as f:
        f.write(job_info)
|
|
|
|
def fetch_job_info2(url1, timeout=10):
    """Return the text of the last ``<p class="name">`` element on a page.

    The page at ``url1`` is downloaded with a desktop-browser User-Agent,
    decoded as UTF-8 and parsed with the ``lxml`` parser.

    Args:
        url1: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block forever.

    Returns:
        The unstripped text of the last element matching ``p.name``, or an
        empty string when the page has no such element.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
    }
    r = requests.get(url1, headers=head, timeout=timeout)
    # Force UTF-8 instead of the encoding requests guesses from the headers.
    r.encoding = 'utf-8'
    mysoup = BeautifulSoup(r.text, 'lxml')
    result = mysoup.select("p.name")

    # Guard the empty case: previously the result variable was only bound
    # inside the loop, so a page without a match raised UnboundLocalError.
    if not result:
        return ""

    # The original loop overwrote its result on every iteration, so the
    # last match is the one that was returned.
    return result[-1].get_text()
|
|
|
|
# Crawl the artist listing, then for every artist page:
#   * append all paragraph text to ``output_file1`` (via fetch_job_info1);
#   * append the artist name followed by the page's sub-menu URLs, one per
#     line, to ``output_file2``.
base_url = "http://www.jvrmusic.com"
url = "http://www.jvrmusic.com/artist"
output_file1 = "artist.txt"
output_file2 = "artist_info.txt"
# Seconds to wait on the requests issued directly by this script, so a
# stalled server cannot hang the crawl forever.
request_timeout = 10

head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
}

# Download the listing page and collect the anchors inside ``div.item``.
r = requests.get(url, headers=head, timeout=request_timeout)
r.encoding = 'utf-8'
mysoup = BeautifulSoup(r.text, 'lxml')
artist_links = mysoup.select("div.item a")

for artist_link in artist_links:
    artist_href = artist_link.get("href")
    if not artist_href:
        # An anchor without an href cannot be followed; concatenating None
        # onto the base URL would raise TypeError.
        continue
    # urljoin handles relative, root-relative and absolute hrefs alike.
    url1 = urljoin(base_url, artist_href)

    fetch_job_info1(url1, output_file1)

    # Fetch the artist page again to read its sub-menu links.
    r = requests.get(url1, headers=head, timeout=request_timeout)
    r.encoding = 'utf-8'
    mysoup2 = BeautifulSoup(r.text, 'lxml')
    artist_links2 = mysoup2.select("div.sub-menu a")

    # First line is the artist name, then one sub-menu URL per line.
    info_lines = [fetch_job_info2(url1)]
    for sub_link in artist_links2:
        sub_href = sub_link.get("href")
        if sub_href:
            info_lines.append(urljoin(base_url, sub_href))
    info = "".join(line + "\n" for line in info_lines)

    with open(output_file2, 'a', encoding='utf-8') as f:
        f.write(info)