You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
3.8 KiB
58 lines
3.8 KiB
# coding:utf-8
|
|
import requests
|
|
import time
|
|
from lxml import etree
|
|
import csv
|
|
# import requests
|
|
import sys
|
|
import io
|
|
# Replace sys.stdout with a text wrapper around the same underlying byte
# buffer that always encodes as UTF-8, independent of the console's default
# encoding. This runs at import time and affects every later print() call.
# NOTE(review): presumably needed because the scraped movie titles are
# Chinese text -- confirm on the target console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')
|
|
|
|
# Request headers used when fetching the film list page: a captured browser
# session cookie and a desktop Chrome User-Agent string.
_LIST_PAGE_COOKIE = "_lxsdk_cuid=17d9260857ac8-0261ae8e8cf9cf-404b032d-15f900-17d9260857ac8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid_n_v=v1; uuid=781C6E1056F811EC8FA005B59D5174D5DC740A9AC92C4E35961CD1F8E2A30E14; _csrf=8a489d7aa02e782a3d650bb2c6b75d59905f022dc2cf7bfe73cea5aa615cd653; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1638838573; _lxsdk=781C6E1056F811EC8FA005B59D5174D5DC740A9AC92C4E35961CD1F8E2A30E14; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1638845128; __mta=141972365.1638838584122.1638844841395.1638845127805.14; _lxsdk_s=17d928b96b8-f72-49f-714%7C%7C27"
_LIST_PAGE_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"

header = {
    "Cookie": _LIST_PAGE_COOKIE,
    "User-Agent": _LIST_PAGE_USER_AGENT,
}

# CSV header row: movie name, rating.
csv_tab = ["电影名", "评分"]
|
|
def get_url(timeout=10):
    """Collect the detail-page URLs of the movies on the Maoyan film list.

    Downloads ``https://www.maoyan.com/films`` using the module-level
    ``header`` dict, parses the response with lxml, and takes the first
    ``./div/a`` link found under each ``<dd>`` element.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection raises instead of blocking
            indefinitely.

    Returns:
        A list of absolute URLs (``https://www.maoyan.com`` followed by the
        link's ``href``), in document order. ``<dd>`` elements that contain
        no such link are skipped.
    """
    base_url = 'https://www.maoyan.com'
    response = requests.get(base_url + '/films', headers=header, timeout=timeout)
    html = response.content.decode()
    h = etree.HTML(html)

    url_list = []
    for item in h.xpath('//dd'):
        hrefs = item.xpath('./div/a/@href')
        # The selector matches every <dd> in the document; skip any that
        # has no link rather than failing on an empty xpath result.
        if not hrefs:
            continue
        url_list.append(base_url + hrefs[0])
    return url_list
|
|
def get_html_info(url_list, timeout=10):
    """Download each movie detail page and save its HTML under ``./file/``.

    For every URL the page is fetched with a fixed cookie and a mobile
    Chrome User-Agent, the text nodes of the ``movie-cn-name`` heading are
    read as the title, the raw xpath result is printed, and the page source
    is written as UTF-8 to ``./file/<title>.html``.

    Args:
        url_list: Iterable of detail-page URLs to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection raises instead of blocking
            indefinitely.

    Returns:
        None. The results are the files written to disk.
    """
    import os  # local import: only needed to create the output directory

    # Built once: the headers do not depend on the URL being fetched.
    # NOTE(review): the cookie names (BDUSS_BFESS, BAIDUID_BFESS, ...) look
    # like Baidu session cookies, and a session cookie hard-coded in source
    # is a credential -- confirm it is required and load it from
    # configuration instead.
    header2 = {
        "Cookie": "H_WISE_SIDS_BFESS=40210_40320_40079_40364_40351_40366_40376_40398_40445_40466_40472_40317_40513; BDUSS_BFESS=pvcXVPZFpNMGlnLVRqMGdMRXJ4UGxKU2ZDRjZvWEJtM2RSYUhYd3cwczJiMEJtSVFBQUFBJCQAAAAAAAAAAAEAAABAYT~0tMCz5rPmbG92ZQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADbiGGY24hhmMV; BDSFRCVID_BFESS=UJ4OJeC624xDmubtX6v5rpzIEm47KB5TH6f3ojSHmBX7Qt33nYFJEG0Psx8g0Kub-FLJogKKLmOTHpuF_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF_BFESS=JnCH_DDhJKt3fP36q4ofK4_WKUnh-I62aJ0f3qbvWJ5TMCoL04jO-4D30fR3-MQ7bN5y2tta5hOKShPCb6-M3qFq-UriJ-LfLaQLhUby3l02VhQIe-t2yU_IXJJXQ4RMW23rWl7mWPJKfJbND55KLntYhpJBKpIf2J6KaJoCbnLWeIJIjjCMD5v-jaLeq6naaK6H0n6VMTrjDCvvW5Ocy4LdjG5QLMPe0anJXCJD3qvVh-K4ylQpMRIg3-Aq54RkJjQjLMcgWxj8SRcaLJjqQfbQ0-5hqP-jW2cua-KXHR7JOpvshfnxyb80QRPH-Rv92DQMVU52QqcqEIQHQT3m5-5bbN3ht6IJfRKf_K-MtKvbfP0k2KcfbJttbUIX5-RLf5QkVp7F5l8-hl8x3qQWKR0RjxRJbJOEWmTfWM7DtDQxOKQphpQaypjbbqj7-ljRt6baKInN3KJmHpC9bT3vLfuwDJ0L2-biW2JL2Mbd-P5P_IoG2Mn8M4bb3qOpBtQmJeTxoUJ25DnJh-PGe6KWj65LDaLOqbO0K5v0QbP8Kbu38fnDXU6qLT5XeH7wbPnNbN-qBhQ--PczsJCmjU4Kjl0njxQybt6KWD6NXfnhLf5FelQH0MonDh8-3H7MJUntKHTuaMoO5hvv8b6O3M725fKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQbG_EJjLjfRkf_KtQ-n8_fbTph4OhhR08-UAX5-RLfaTZ_p7F5l8-hCb9Q-ch-R0RjlDtbJOEWmTL_I3c0D5xOKQIDPn80UkDLRAfQR0etKokKqcN3KJmqPP9bT3v5tjDQ4vZ2-biW2JL2Mbd-P5P_IoG2Mn8M4bb3qOpBtQmJeTxoUJ25DnJhbLGe4bK-TryjNADtUK; ZFY=b:BflOtj91mDiN9pZVZa5s8mhkTL0e:BAjKS5DO:A2Hl18:C; BAIDUID_BFESS=6DE054C0AAD46F264BDDF0A3F9303A0D:FG=1",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Mobile Safari/537.36"
    }

    # Path separators and characters reserved in Windows file names are
    # replaced with "_" so a title can never escape ./file/ or break open().
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    # open(..., "w") does not create missing directories.
    os.makedirs("./file", exist_ok=True)

    for url in url_list:
        html = requests.get(url, headers=header2, timeout=timeout).content.decode()
        h = etree.HTML(html)
        # xpath returns a list of text nodes, not a single string.
        title = h.xpath("//div[@class='movie-cn-name']/h1//text()")
        print(title)

        # Join the text nodes into a real title instead of using the list's
        # repr (e.g. "['Title']") as the file name.
        name = "".join(title).strip()
        if not name:
            # No title on the page: fall back to the last URL path segment
            # so pages without a title do not all overwrite the same file.
            name = url.rstrip("/").rsplit("/", 1)[-1] or "untitled"
        name = name.translate(unsafe_chars)

        # The context manager closes the file even if write() raises.
        with open("./file/" + name + ".html", "w", encoding='utf-8') as fp:
            fp.write(html)
|
|
if __name__ == '__main__':
    # Script entry point: gather the detail-page links from the film list,
    # then download and save each of those pages.
    detail_urls = get_url()
    get_html_info(detail_urls)