|
|
|
@ -0,0 +1,14 @@
|
|
|
|
|
# Clean the scraped data with regular expressions (XPath would work as well).
|
|
|
|
|
import re,numpy
|
|
|
|
|
def process(resp):
    """Extract article records from page HTML using regular expressions.

    Args:
        resp: The HTML text of the page, as a ``str``.

    Returns:
        A list of ``(title, content, author, fav_num)`` tuples of strings.
        The four fields are matched independently and paired by position
        with ``zip``, so the result is as long as the shortest match list;
        an input with no matches yields ``[]``.

    Note:
        ``.`` does not match newlines here, so an element whose text spans
        several lines is not captured.
    """
    # Raw strings keep ``\d`` a regex escape instead of an invalid string
    # escape (SyntaxWarning on Python 3.12+).

    # Titles
    titles = re.findall(r'<h2 class="popularem-title">(.+?)</h2>', resp)
    # Summaries
    contents = re.findall(r'<p class="popularem-abs padshow">(.+?)</p>', resp)
    # Authors
    authors = re.findall(r'<a href="author_\d+?" class="column">(.+?)</a>', resp)
    # Like counts
    fav_nums = re.findall(r'<span class="fav" data-id="\d+?">(.+?)</span>', resp)

    # Pair the fields up by position and materialize them as a list.
    return list(zip(titles, contents, authors, fav_nums))
|