You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

58 lines
3.8 KiB

# coding:utf-8
import requests
import time
from lxml import etree
import csv
# import requests
import sys
import io
# Re-wrap stdout's underlying byte buffer so that everything printed by this
# script is encoded as UTF-8, regardless of the console's default encoding.
# NOTE(review): presumably needed so the Chinese film titles print on consoles
# with a non-UTF-8 default code page -- confirm. This runs at import time and
# requires sys.stdout to expose a ``.buffer`` attribute.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')
# HTTP request headers sent by get_url() when fetching the film listing page.
# NOTE(review): the Cookie value is a hard-coded session string (presumably
# copied from a browser session and likely expired by now) -- confirm whether
# it is still required, and avoid committing live session data.
header = {
"Cookie": "_lxsdk_cuid=17d9260857ac8-0261ae8e8cf9cf-404b032d-15f900-17d9260857ac8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid_n_v=v1; uuid=781C6E1056F811EC8FA005B59D5174D5DC740A9AC92C4E35961CD1F8E2A30E14; _csrf=8a489d7aa02e782a3d650bb2c6b75d59905f022dc2cf7bfe73cea5aa615cd653; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1638838573; _lxsdk=781C6E1056F811EC8FA005B59D5174D5DC740A9AC92C4E35961CD1F8E2A30E14; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1638845128; __mta=141972365.1638838584122.1638844841395.1638845127805.14; _lxsdk_s=17d928b96b8-f72-49f-714%7C%7C27",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
# CSV column headers: "movie title" and "rating".
# Not referenced by any executable code visible in this file.
csv_tab=["电影名","评分"]
def get_url():
    """Collect the detail-page URLs of the films on Maoyan's film listing.

    Fetches ``https://www.maoyan.com/films`` with the module-level ``header``
    dict, then, for every ``<dd>`` element in the page, takes the first
    ``./div/a/@href`` value and prefixes it with the site origin.

    Entries without a link are skipped instead of aborting the whole crawl
    with an ``IndexError``. The title/score extraction that used to run in
    this loop was dropped: its result was never used, and its unguarded
    ``[0]`` indexing could only make the function fail.

    Returns:
        list[str]: Absolute film detail URLs in page order. Empty when the
        page contains no matching entries.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not answer within the timeout.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    base_url = 'https://www.maoyan.com'

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(base_url + '/films', headers=header, timeout=10)
    tree = etree.HTML(response.content.decode())

    url_list = []
    for entry in tree.xpath('//dd'):
        hrefs = entry.xpath('./div/a/@href')
        if not hrefs:
            # A <dd> that is not a film card (no link inside): nothing to
            # collect, so move on rather than raising IndexError.
            continue
        url_list.append(base_url + hrefs[0])
    return url_list
def get_html_info(url_list):
    """Download each page in *url_list* and save its HTML under ``./file/``.

    For every URL the page is fetched with a mobile User-Agent, the title is
    read from ``//div[@class='movie-cn-name']/h1`` and printed, and the raw
    HTML is written as UTF-8 to ``./file/<title>.html``.

    The file name is built from the title *text*. Previously the XPath result
    list itself was stringified, giving names such as ``['Title'].html`` and
    making every page without a title overwrite the same ``[].html``. Pages
    with no title now get a per-position fallback name instead.

    Args:
        url_list: Iterable of absolute page URLs, e.g. the result of
            ``get_url()``.

    Returns:
        None. The output is the set of files written to ``./file/``.

    Raises:
        requests.exceptions.RequestException: If a request fails or does not
            answer within the timeout.
        UnicodeDecodeError: If a response body is not valid UTF-8.
        OSError: If the output directory or a file cannot be created.
    """
    import os
    import re

    # Loop-invariant, so built once instead of on every iteration.
    # NOTE(review): this is a hard-coded session cookie committed to source.
    # The key names (BDUSS_BFESS, BAIDUID_BFESS, ...) look like Baidu cookies
    # rather than Maoyan ones -- confirm it is needed at all, and load it from
    # configuration instead of the repository if it is.
    header2 = {
        "Cookie": "H_WISE_SIDS_BFESS=40210_40320_40079_40364_40351_40366_40376_40398_40445_40466_40472_40317_40513; BDUSS_BFESS=pvcXVPZFpNMGlnLVRqMGdMRXJ4UGxKU2ZDRjZvWEJtM2RSYUhYd3cwczJiMEJtSVFBQUFBJCQAAAAAAAAAAAEAAABAYT~0tMCz5rPmbG92ZQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADbiGGY24hhmMV; BDSFRCVID_BFESS=UJ4OJeC624xDmubtX6v5rpzIEm47KB5TH6f3ojSHmBX7Qt33nYFJEG0Psx8g0Kub-FLJogKKLmOTHpuF_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF_BFESS=JnCH_DDhJKt3fP36q4ofK4_WKUnh-I62aJ0f3qbvWJ5TMCoL04jO-4D30fR3-MQ7bN5y2tta5hOKShPCb6-M3qFq-UriJ-LfLaQLhUby3l02VhQIe-t2yU_IXJJXQ4RMW23rWl7mWPJKfJbND55KLntYhpJBKpIf2J6KaJoCbnLWeIJIjjCMD5v-jaLeq6naaK6H0n6VMTrjDCvvW5Ocy4LdjG5QLMPe0anJXCJD3qvVh-K4ylQpMRIg3-Aq54RkJjQjLMcgWxj8SRcaLJjqQfbQ0-5hqP-jW2cua-KXHR7JOpvshfnxyb80QRPH-Rv92DQMVU52QqcqEIQHQT3m5-5bbN3ht6IJfRKf_K-MtKvbfP0k2KcfbJttbUIX5-RLf5QkVp7F5l8-hl8x3qQWKR0RjxRJbJOEWmTfWM7DtDQxOKQphpQaypjbbqj7-ljRt6baKInN3KJmHpC9bT3vLfuwDJ0L2-biW2JL2Mbd-P5P_IoG2Mn8M4bb3qOpBtQmJeTxoUJ25DnJh-PGe6KWj65LDaLOqbO0K5v0QbP8Kbu38fnDXU6qLT5XeH7wbPnNbN-qBhQ--PczsJCmjU4Kjl0njxQybt6KWD6NXfnhLf5FelQH0MonDh8-3H7MJUntKHTuaMoO5hvv8b6O3M725fKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQbG_EJjLjfRkf_KtQ-n8_fbTph4OhhR08-UAX5-RLfaTZ_p7F5l8-hCb9Q-ch-R0RjlDtbJOEWmTL_I3c0D5xOKQIDPn80UkDLRAfQR0etKokKqcN3KJmqPP9bT3v5tjDQ4vZ2-biW2JL2Mbd-P5P_IoG2Mn8M4bb3qOpBtQmJeTxoUJ25DnJhbLGe4bK-TryjNADtUK; ZFY=b:BflOtj91mDiN9pZVZa5s8mhkTL0e:BAjKS5DO:A2Hl18:C; BAIDUID_BFESS=6DE054C0AAD46F264BDDF0A3F9303A0D:FG=1",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Mobile Safari/537.36",
    }

    out_dir = "./file/"
    # open(..., "w") does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    for index, url in enumerate(url_list):
        # Without a timeout, requests waits indefinitely on a stalled connection.
        html = requests.get(url, headers=header2, timeout=10).content.decode()
        h = etree.HTML(html)
        title = h.xpath("//div[@class='movie-cn-name']/h1//text()")
        print(title)

        # xpath() returns a list of text nodes: join them into one string and
        # replace characters that are not allowed in file names.
        name = "".join(part.strip() for part in title)
        name = re.sub(r'[\\/:*?"<>|]+', "_", name)
        if not name:
            # No title found: use a distinct name per page so that title-less
            # pages do not overwrite one another.
            name = "untitled_%d" % index

        with open(os.path.join(out_dir, name + ".html"), "w", encoding='utf-8') as fp:
            fp.write(html)
if __name__ == '__main__':
    # Script entry point: gather the film page URLs from the listing first,
    # then download and save every one of those pages.
    film_page_urls = get_url()
    get_html_info(film_page_urls)