# washer.py
# Clean scraped page text with regular expressions (XPath would work too).
import re

import numpy  # NOTE(review): not referenced in this module; kept so importers are unaffected

# NOTE(review): none of the patterns below contains an HTML tag, which suggests
# the markup delimiters around each capture group were lost before this file
# was captured. They are reproduced exactly as found -- confirm against the
# scraped page and restore the real delimiters for each field.
_TITLE_RE = re.compile(r"\n\n(.+?)\n\n")
_CONTENT_RE = re.compile(r"\n\n(.+?)\n\n")
_AUTHOR_RE = re.compile(r"(.+?)")
_FAV_NUM_RE = re.compile(r"(.+?)")


def process(resp):
    """Extract (title, summary, author, like-count) records from page text.

    Each field is collected independently with its own regular expression and
    the four result lists are then combined position by position.

    Args:
        resp: The page text to search, as a ``str``.

    Returns:
        A list of 4-tuples ``(title, content, author, fav_num)``; every item
        is the substring captured by the corresponding pattern.  ``zip`` stops
        at the shortest list, so when the fields do not match the same number
        of times the surplus matches are dropped without any error.
    """
    # Title
    titles = _TITLE_RE.findall(resp)
    # Summary
    contents = _CONTENT_RE.findall(resp)
    # Author
    authors = _AUTHOR_RE.findall(resp)
    # Number of likes
    fav_nums = _FAV_NUM_RE.findall(resp)
    # Pair the fields up by position and materialise the result as a list.
    return list(zip(titles, contents, authors, fav_nums))