You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
# Clean the scraped data with regular expressions (XPath would also work).
|
|
|
|
|
import re,numpy
|
|
|
|
|
def process(resp: str) -> list:
    """Extract article records from a page of HTML using regular expressions.

    Four fields are pulled out of ``resp`` independently (title, summary,
    author and like count) and then combined positionally.

    Args:
        resp: HTML text of the page to parse.

    Returns:
        A list of ``(title, content, author, fav_num)`` tuples of strings,
        in document order. An input with no matches yields ``[]``.

    Notes:
        * ``zip`` stops at the shortest of the four match lists, so surplus
          matches of any one field are dropped. Because the fields are matched
          independently, an article that lacks one field shifts the pairing
          of every record after it.
        * ``.`` does not match a newline here (no ``re.S`` flag), so an
          element whose text spans several lines is not matched.
    """
    # Raw strings keep ``\d`` as a regex token instead of an invalid string
    # escape (a SyntaxWarning on Python 3.12+).

    # Title
    title = re.findall(r'<h2 class="popularem-title">(.+?)</h2>', resp)
    # Summary
    content = re.findall(r'<p class="popularem-abs padshow">(.+?)</p>', resp)
    # Author
    author = re.findall(r'<a href="author_\d+?" class="column">(.+?)</a>', resp)
    # Like count (kept as the matched text, not converted to int)
    fav_num = re.findall(r'<span class="fav" data-id="\d+?">(.+?)</span>', resp)

    # Pack the parallel lists into one tuple per article.
    return list(zip(title, content, author, fav_num))
|