master
p36049127 4 years ago
parent e9f2b37e10
commit ba8cbf0f56

@ -0,0 +1,91 @@
import requests
import re
def get_html(url):
    """Download *url* and return its body decoded as UTF-8.

    The request is sent with the module-level ``headers`` mapping.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text, or ``None`` when the request fails
        (an error message is printed in that case).
    """
    try:
        # Only the network call is guarded so unrelated errors are not hidden.
        response = requests.get(url, headers=headers)
    except requests.RequestException:
        print('请求网址出错')
        return None
    # Force UTF-8 decoding instead of the charset guessed from the response.
    response.encoding = 'utf-8'
    return response.text
def parse_url_dog(html):
    """Return the dog-category path fragments found in the home-page HTML.

    Every ``/vod/...`` list-item link whose text is "狗狗" followed by two
    more characters is matched. Because ``.`` also matches newlines here,
    a capture may begin at an earlier ``/vod/`` link and run up to the
    matching one, so only the tail of each capture is kept.

    Args:
        html: Page source to search.

    Returns:
        A list of strings: for each match, the four characters that precede
        the final character of the captured href fragment.
    """
    link_re = re.compile('<li><a href="/vod/(.*?)" target="_blank">狗狗..</a></li>', re.S)
    # Slice [-5:-1] drops the last character and keeps the four before it.
    return [captured[-5:-1] for captured in link_re.findall(html)]
def parse_url_cat(html):
    """Return the cat-category path fragments found in the home-page HTML.

    Every ``/vod/...`` list-item link whose text is "猫猫" followed by two
    more characters is matched. Because ``.`` also matches newlines here,
    a capture may begin at an earlier ``/vod/`` link and run up to the
    matching one, so only the tail of each capture is kept.

    Args:
        html: Page source to search.

    Returns:
        A list of strings: for each match, the four characters that precede
        the final character of the captured href fragment.
    """
    link_re = re.compile('<li><a href="/vod/(.*?)" target="_blank">猫猫..</a></li>', re.S)
    # Slice [-5:-1] drops the last character and keeps the four before it.
    return [captured[-5:-1] for captured in link_re.findall(html)]
def parse_url(html):
    """Extract the page title(s) and the numeric article ids from a listing page.

    Args:
        html: Page source of a category listing.

    Returns:
        A two-element list ``[titles, article_ids]``: ``titles`` holds the
        text captured before "-我的宠舍" in each ``<title>`` element, and
        ``article_ids`` holds the digit strings captured from each
        ``/vod<digits>`` list-item link.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    id_pattern = re.compile(r'<li><a href="/vod(\d+)" target="_blank">.*?</i></div>', re.S)
    title_pattern = re.compile('<title>(.*?)-我的宠舍</title>', re.S)
    return [title_pattern.findall(html), id_pattern.findall(html)]
def parse_result(html):
    """Extract (title, paragraph) pairs from an article page.

    Args:
        html: Page source of an article.

    Returns:
        A list of 2-tuples: the text captured before "-我的宠舍" in a
        ``<title>`` element and the contents of the first ``<p>`` element
        that follows it.
    """
    qa_re = re.compile('<title>(.*?)-我的宠舍</title>.*?<p>(.*?)</p>', re.S)
    # Both groups always participate in a match, so groups() has no None.
    return [found.groups() for found in qa_re.finditer(html)]
def write_item_to_file(item, name):
    """Append one question/answer pair to the text file named after *name*.

    Args:
        item: Indexable pair of strings; ``item[0]`` is written after the
            "问:" prefix and ``item[1]`` after the "答:" prefix.
        name: Converted with ``str`` and used as the leading part of the
            file name inside the "问答" directory.
    """
    target = '问答\\' + str(name) + '知识.txt'
    # NOTE(review): the 'ANSI' codec name is presumably only available on
    # Windows (matching the backslash separator above) -- confirm before
    # running this on another platform.
    with open(target, 'a', encoding='ANSI') as out:
        # No newline or separator is written between the two parts.
        out.write('问:' + item[0] + '')
        out.write('答:' + item[1] + '')
# Browser-like User-Agent sent with every request made by get_html().
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}
url_main = 'https://www.chongshe.cn/'
html_main = get_html(url_main)
# get_html() returns None when the request fails; treat that as "no
# categories" instead of handing None to the regex helpers.
dog_urls = parse_url_dog(html_main) if html_main is not None else []  # e.g. ['vdys', 'vdjk', 'vdxl', 'vdsh', 'vdfy']
cat_urls = parse_url_cat(html_main) if html_main is not None else []

# urls is a flat list alternating [titles], [article ids] for each category
# page: dog categories first, then cat categories.
urls = []
for dog_url in dog_urls:
    tmp = 'https://www.chongshe.cn/vod/' + str(dog_url)
    html_dog = get_html(tmp)
    if html_dog is None:
        continue  # request failed; get_html() already reported it
    urls.extend(parse_url(html_dog))
for cat_url in cat_urls:
    tmp = 'https://www.chongshe.cn/vod/' + str(cat_url)
    html_cat = get_html(tmp)
    if html_cat is None:
        continue  # request failed; get_html() already reported it
    urls.extend(parse_url(html_cat))

# Walk the (titles, article ids) pairs and store every article found under
# the first title of its category page.
for titles, article_ids in zip(urls[0::2], urls[1::2]):
    for url in article_ids:
        tmp = 'https://www.chongshe.cn/vod' + str(url)
        html = get_html(tmp)
        if html is None:
            continue  # request failed; get_html() already reported it
        for item in parse_result(html):
            # parse_result() already yields tuples of strings, so they are
            # passed through directly rather than rebuilt with eval().
            write_item_to_file(item, titles[0])
# Add the scraped category titles to the catalogue file.
# NOTE: everything below is disabled -- it is wrapped in a triple-quoted
# string literal, so it is evaluated as a plain string and never executed.
# As written, it reads catalogue.txt line by line, appends a sub-section
# after the dog heading line (titles from urls[0], urls[2], ... urls[8]) and
# after the cat heading line (titles from urls[10] onward, step 2), then
# rewrites catalogue.txt with the result.
# NOTE(review): the fixed index 10 presumably assumes exactly five dog
# categories -- confirm before re-enabling.
'''
with open('catalogue.txt','r', encoding='ANSI') as f:
line_data = ''
for line in f:
line_data += line
if line == '狗狗\n':
line_data += '\t常见问题\n'
for i in range(0,10,2):
line_data += ('\t\t' + urls[i][0] + '\n')
if line == '猫猫\n':
line_data += '\t常见问题\n'
for i in range(10,len(urls),2):
line_data += ('\t\t' + urls[i][0] + '\n')
f.close()
with open('catalogue.txt','w', encoding='ANSI') as f:
f.write(line_data)
f.close()'''
Loading…
Cancel
Save