You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
71 lines
2.1 KiB
71 lines
2.1 KiB
'''
HTML parser to extract data
'''
|
|
import re
|
|
from collections import namedtuple
|
|
from requests_html import HTML
|
|
from aspider.routeing import get_router
|
|
# Module-level router obtained from aspider.routeing; create_tag() passes
# every tag link through its get_url_path() before storing it.
router = get_router()
|
|
|
|
|
|
# One tag extracted from an item page:
#   type  - tag category (a header label, or 'genre' / 'star')
#   value - visible text of the tag's link
#   link  - the tag's link as returned by router.get_url_path()
Tag = namedtuple('Tag', 'type value link')
|
|
|
|
|
|
def parse_item(text):
    '''
    Parse the html of one item page into its meta data and its tags.

    Args:
        text : str - html text

    Returns:
        tuple: (dict, list)
        dict - meta data for this item, with the keys 'fanhao', 'title',
               'cover_img_url', 'release_date' and 'length'
        list - tags for this item (Tag tuples built by create_tag)

    Raises:
        IndexError: an element or text field that is addressed by
            position is missing from the page
        ValueError: the page title contains no whitespace, so it cannot
            be split into 'fanhao' and 'title'
        AttributeError: the length paragraph contains no digits
    '''
    html = HTML(html=text)

    title_css = 'body > div.container > h3'
    title = html.find(title_css)[0].text

    cover_img_css = 'body > div.container > div.row.movie > div.col-md-9.screencap > a'
    cover_img_url = html.find(cover_img_css)[0].attrs['href']

    tags_css = 'body > div.container > div.row.movie > div.col-md-3.info'
    tags = html.find(tags_css)[0].find('p')
    release_date = tags[1].text
    length = tags[2].text

    # meta data
    meta = {}
    # The title's first whitespace-separated token is stored as 'fanhao',
    # the remainder as the actual title.
    meta['fanhao'], meta['title'] = title.split(maxsplit=1)
    meta['cover_img_url'] = cover_img_url
    # Second whitespace-separated token of the paragraph text
    # (presumably the date following its label - confirm against a page).
    meta['release_date'] = release_date.split()[1]
    # First run of digits in the length paragraph.
    meta['length'] = re.search(r'\d+', length).group()

    tag_list = []
    # Paragraphs 0-2 are consumed positionally above (or skipped); every
    # remaining paragraph is scanned for tags.
    for tag in tags[3:]:
        links = tag.find('a')
        spans = tag.find('span.header')
        if spans and len(links) == 1:
            # Labelled paragraph with a single link: the header span text
            # is used as the tag type.
            tag_type = spans[0].text
            tag_link = links[0].attrs['href']
            tag_value = links[0].text
            if tag_type and tag_value:
                tag_list.append(create_tag(tag_type, tag_value, tag_link))
        else:
            for link in links:
                tag_link = link.attrs['href']
                tag_value = link.text
                # Classify every link from its own href only. Previously
                # the type was initialised once per paragraph, so a link
                # matching neither keyword silently inherited the type of
                # the link before it. 'star' is tested first because it
                # took precedence when both keywords were present.
                if 'star' in tag_link:
                    tag_type = 'star'
                elif 'genre' in tag_link:
                    tag_type = 'genre'
                else:
                    tag_type = ''
                if tag_type and tag_value:
                    tag_list.append(create_tag(tag_type, tag_value, tag_link))

    return meta, tag_list
|
|
|
|
|
|
def create_tag(tag_type, tag_value, tag_link):
    '''
    Build a Tag from its three parts.

    Args:
        tag_type : tag category, stored unchanged
        tag_value : tag text, stored unchanged
        tag_link : link of the tag; it is passed through
            router.get_url_path() and the result is what gets stored
            (presumably the path portion of the url - confirm in
            aspider.routeing)

    Returns:
        Tag: (type, value, link)
    '''
    return Tag(
        type=tag_type,
        value=tag_value,
        link=router.get_url_path(tag_link),
    )
|