import requests
from bs4 import BeautifulSoup
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}

def get_info(url):
    # Fetch one chart page and pull out rank, singer, song title and duration.
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    # Use song_time instead of time so the time module imported above is not shadowed.
    for rank, title, song_time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0].strip(),
            'song': title.get_text().split('-')[1].strip() if len(title.get_text().split('-')) > 1 else '',
            'time': song_time.get_text().strip()
        }
        print(data)

if __name__ == '__main__':
    urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(str(i)) for i in range(1, 24)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # pause between requests to avoid hammering the site
In items.py, remove the # in front of the name field.
In settings.py, remove the comment markers on lines 65-67; the resulting configuration is sketched below.
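For reference, here is roughly what those two files look like after the edits. This is a sketch based on the default Scrapy project templates, assuming the project is named movie (as the import in meiju.py suggests); the 300 priority is just the template default.

items.py
import scrapy

class MovieItem(scrapy.Item):
    # uncomment this field so the spider can store the show title
    name = scrapy.Field()

settings.py (the block to uncomment)
ITEM_PIPELINES = {
    'movie.pipelines.MoviePipeline': 300,
}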
meiju.py
import scrapy
from movie.items import MovieItem

class MeijuSpider(scrapy.Spider):
    name = "meiju"
    # Note: start_urls points at meijutt.tv while allowed_domains lists meijutt.com;
    # if follow-up requests get filtered as offsite, add "meijutt.tv" here as well.
    allowed_domains = ["meijutt.com"]
    start_urls = ['https://www.meijutt.tv/topiclist/2024xinjutop.html']

    def parse(self, response):
        # Each <ul> under the topic_box div is one show entry.
        movies = response.xpath('//div[@class="topic_box"]/ul')
        for each_movie in movies:
            item = MovieItem()
            item['name'] = each_movie.xpath('./li[@class="font_14"]/a/@title').extract()[0]
            yield item
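With the pipeline enabled in settings.py, the spider is started from the project root with scrapy crawl meiju; every item it yields is handed to the MoviePipeline below.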
pipelines.py
import json

class MoviePipeline(object):
    def open_spider(self, spider):
        # Open the output file once when the spider starts.
        self.file = open('log.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(str(item) + '\n')
        # Return the item so any later pipelines still receive it.
        return item
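The file imports json but never uses it. If machine-readable output is preferred over the plain str(item) dump above, process_item could instead write one JSON object per line; a minimal sketch (an assumption, not part of the original code), reusing the same log.txt handle opened in open_spider:

    def process_item(self, item, spider):
        # dict(item) turns the Scrapy Item into a plain dict;
        # ensure_ascii=False keeps non-ASCII titles readable in the file.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item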
import pandas as pd

# Path to the Excel file
excel_file_path = 'path_to_your_excel_file.xlsx'

# Read the Excel file with pandas.
# This assumes the workbook contains a single sheet; if there are several,
# a specific one can be selected with the sheet_name parameter (see below).
df = pd.read_excel(excel_file_path)

# Print the contents of the DataFrame
print(df)
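Following up on the sheet_name note above, a minimal sketch of the usual ways to pick a sheet (the name 'Sheet1' is only an illustration):

# Select a sheet by position or by name when the workbook has more than one
df_first = pd.read_excel(excel_file_path, sheet_name=0)
df_named = pd.read_excel(excel_file_path, sheet_name='Sheet1')
# sheet_name=None loads every sheet into a dict of {sheet name: DataFrame}
all_sheets = pd.read_excel(excel_file_path, sheet_name=None)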