|
|
|
@ -0,0 +1,91 @@
|
|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
def get_html(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Uses the module-level ``headers`` dict (browser User-Agent) for the
    request.  On failure prints a diagnostic and returns ``None`` — the
    original returned ``None`` implicitly via its except path; this is
    now explicit.
    """
    try:
        response = requests.get(url, headers=headers)
        # Fix: the original assigned encoding 'GBK' and then immediately
        # overwrote it with 'utf-8'; only the final assignment ever took
        # effect, so the dead store is removed.
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException:
        # Fix: narrowed from a bare ``except:`` so programming errors and
        # KeyboardInterrupt are no longer silently swallowed; only
        # request/transport failures are handled here.
        print('请求网址出错')
        return None
|
|
|
|
|
|
|
|
|
|
def parse_url_dog(html):
    """Extract dog-category codes from the front-page HTML.

    Matches links of the form ``/vod/...`` whose anchor text is 狗狗
    followed by two characters, and keeps characters [-5:-1] of each
    captured path (the 4-character category code, e.g. 'vdys').
    """
    dog_link = re.compile(
        '<li><a href="/vod/(.*?)" target="_blank">狗狗..</a></li>', re.S)
    # Same trimming as the original in-place loop, expressed as a
    # comprehension over the matches.
    return [path[-5:-1] for path in dog_link.findall(html)]
|
|
|
|
|
|
|
|
|
|
def parse_url_cat(html):
    """Extract cat-category codes from the front-page HTML.

    Mirrors ``parse_url_dog`` but matches anchor text starting with 猫猫;
    returns the 4-character code (characters [-5:-1]) of each captured
    ``/vod/...`` path.
    """
    cat_link = re.compile(
        '<li><a href="/vod/(.*?)" target="_blank">猫猫..</a></li>', re.S)
    # Same trimming as the original in-place loop, expressed as a
    # comprehension over the matches.
    return [path[-5:-1] for path in cat_link.findall(html)]
|
|
|
|
|
|
|
|
|
|
def parse_url(html):
    """Parse a category page into ``[titles, article_ids]``.

    Returns a two-element list: first the page titles (text before
    '-我的宠舍' in <title>), then the numeric ids captured from
    ``/vod<id>`` article links.

    Fix: the patterns are now raw strings so that ``\\d`` is a regex
    escape rather than an invalid Python string escape (which raises a
    SyntaxWarning/DeprecationWarning on modern CPython).
    """
    pattern1 = re.compile(r'<li><a href="/vod(\d+)" target="_blank">.*?</i></div>', re.S)
    items1 = re.findall(pattern1, html)
    pattern2 = re.compile(r'<title>(.*?)-我的宠舍</title>', re.S)
    items2 = re.findall(pattern2, html)
    return [items2, items1]
|
|
|
|
|
|
|
|
|
|
def parse_result(html):
    """Return (question, answer) tuples scraped from an article page.

    The question is the <title> text before '-我的宠舍'; the answer is
    the first <p> body that follows it (DOTALL, so either may span
    multiple lines).
    """
    qa_pattern = re.compile(
        '<title>(.*?)-我的宠舍</title>.*?<p>(.*?)</p>', re.S)
    return qa_pattern.findall(html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_item_to_file(item, name):
    """Append one Q&A pair to the file '问答\\<name>知识.txt'.

    item: a 2-sequence — item[0] is the question, item[1] the answer.
    name: category title used to build the output file name.

    NOTE(review): encoding='ANSI' is a Windows-only alias for the active
    code page; on other platforms open() raises LookupError — confirm the
    target platform before changing it.  The backslash path also assumes
    Windows separators.
    """
    with open('问答\\' + str(name) + '知识.txt', 'a', encoding='ANSI') as f:
        f.write('问:' + item[0] + '?')
        f.write('答:' + item[1] + '')
    # Fix: removed the redundant f.close() that sat inside the with
    # block — the context manager already closes the file on exit.
|
|
|
|
|
|
|
|
|
|
# Browser-like User-Agent so the site serves the scraper normal pages.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}

# Front page of the pet site; category links are scraped from it.
url_main = 'https://www.chongshe.cn/'

html_main = get_html(url_main)

# 4-character category codes for dog and cat sections.
dog_urls = parse_url_dog(html_main) #['vdys', 'vdjk', 'vdxl', 'vdsh', 'vdfy']

cat_urls = parse_url_cat(html_main)

# `urls` alternates [titles, ids, titles, ids, ...]: parse_url returns a
# two-element list per category page, and list concatenation flattens it.
urls = []

for dog_url in dog_urls:
    tmp = 'https://www.chongshe.cn/vod/' + str(dog_url)
    html_dog = get_html(tmp)
    urls = urls + parse_url(html_dog)

for cat_url in cat_urls:
    tmp = 'https://www.chongshe.cn/vod/' + str(cat_url)
    html_cat = get_html(tmp)
    urls = urls + parse_url(html_cat)

# Odd indices hold the article-id lists; urls[i-1][0] is the matching
# category title used to name the output file.
for i in range(len(urls)):
    if i%2==1:
        for url in urls[i]:
            tmp = 'https://www.chongshe.cn/vod' + str(url)
            html = get_html(tmp)
            items = parse_result(html)
            for j in range(len(items)):
                # NOTE(review): eval(str(...)) round-trips the tuple
                # through its repr — a no-op for plain string tuples and
                # unsafe on untrusted text; items[j] alone looks
                # equivalent. Confirm before simplifying.
                write_item_to_file(eval(str(items[j])),urls[i-1][0])

# Add the scraped categories into the catalogue file (disabled draft).
'''
with open('catalogue.txt','r', encoding='ANSI') as f:
    line_data = ''
    for line in f:
        line_data += line
        if line == '狗狗\n':
            line_data += '\t常见问题\n'
            for i in range(0,10,2):
                line_data += ('\t\t' + urls[i][0] + '\n')
        if line == '猫猫\n':
            line_data += '\t常见问题\n'
            for i in range(10,len(urls),2):
                line_data += ('\t\t' + urls[i][0] + '\n')
    f.close()

with open('catalogue.txt','w', encoding='ANSI') as f:
    f.write(line_data)
    f.close()'''
|