Scraping WeChat Official Account Articles with Python

2019-07-03 12:05:48

The Handle class

import os, time, re, json
import hashlib
from urllib.request import urlretrieve

from selenium import webdriver
from bs4 import BeautifulSoup  # parsing below also requires the html5lib package

class Handle(object):

    __slots__ = ['url', 'html', 'bs', 'content', 'data', 'save_path']

    def __init__(self, url, save_path='./remarks/articles/data/'):
        self.url = url
        self.data = {'title': '', 'cover': '', 'author': '', 'content': '',
                     'created_at': '', 'desc': '', 'view': ''}
        self.save_path = save_path

    def do(self):
        # fetch the rendered page
        self.request()
        # parse the HTML and persist the results
        self.parser_html()
        print(self.data['title'] + ': over!')

    def request(self):
        browser = webdriver.Chrome(executable_path=os.path.abspath("chromedriver"))
        browser.get(self.url)
        # scroll to the bottom so lazily loaded images and the read count render
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)
        self.html = browser.page_source
        # quit() shuts down the chromedriver process, not just the window
        browser.quit()

    def parser_html(self):
        self.bs = BeautifulSoup(self.html, 'html5lib')
        # title
        self.data['title'] = self.bs.find('h2', id='activity-name').get_text('', strip=True)
        self.save_path = self.save_path + self.data['title']
        content = self.bs.find('div', id='js_content')
        # view count (the span only exists once the page has fully rendered)
        view_node = self.bs.find('span', id='readNum3')
        self.data['view'] = view_node.get_text('', strip=True) if view_node else ''
        # drop the boilerplate footer starting at the "ABOUT US" paragraph
        a = self.bs.find('p', text=re.compile('ABOUT US'))
        if a:
            a.parent.parent.parent.parent.extract()
        # rewrite image paths: WeChat serves images lazily via data-src
        for img in content.find_all('img'):
            src = img.attrs.get('data-src')
            if not src:
                continue
            img.attrs['src'] = self.download_img(src)
            # drop the attribute so the local copy renders offline
            del img['crossorigin']
        # content
        self.data['content'] = content.prettify()
        # save the HTML
        self.write_html()
        # save the metadata
        self.write_data()

    def write_html(self):
        os.makedirs(self.save_path, exist_ok=True)
        with open(self.save_path + '/content.html', 'w', encoding='utf-8') as h:
            h.write(str(self.data['content']))
        with open(self.save_path + '/index.html', 'w', encoding='utf-8') as h:
            h.write(str(self.html))

    def write_data(self):
        with open(self.save_path + '/data.json', 'w', encoding='utf-8') as d:
            d.write(json.dumps(self.data, indent=4, ensure_ascii=False))

    def download_img(self, url):
        # derive the extension from the wx_fmt query parameter, default to jpeg
        if len(url.split('wx_fmt=')) == 2:
            extension = url.split('wx_fmt=')[1].split('&')[0]
        else:
            extension = 'jpeg'
        # hash the URL so each image is stored (and fetched) only once
        filename = hashlib.md5(url.encode('utf8')).hexdigest() + '.' + extension
        img_dir = './remarks/articles/imgs/'
        os.makedirs(img_dir, exist_ok=True)
        filepath = img_dir + filename
        if not os.path.isfile(filepath):
            urlretrieve(url, filepath)
        return '__IMAGE_PATH__/' + filename
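
request() opens a visible browser window for every article. When running unattended, the same fetch can be done headless; a minimal sketch, assuming a Chrome/chromedriver pair and Selenium version that accept these options (the function name is illustrative):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def build_headless_browser(driver_path='chromedriver'):
    # an explicit window size still matters: the WeChat page
    # lazy-loads images relative to the viewport
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--window-size=1280,2000')
    return webdriver.Chrome(executable_path=driver_path, options=options)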

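download_img fetches images with urlretrieve, which sends no Referer header, and the WeChat image CDN occasionally rejects such requests. A hedged fallback using only the standard library; the header values are illustrative, not anything the CDN documents:

from urllib.request import Request, urlopen

def fetch_image(url, filepath):
    # retry a failed urlretrieve with browser-like headers
    req = Request(url, headers={
        'User-Agent': 'Mozilla/5.0',
        'Referer': 'https://mp.weixin.qq.com/',
    })
    with urlopen(req) as resp, open(filepath, 'wb') as f:
        f.write(resp.read())
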
Usage

from plugins.Handle import Handle

def main():
    urls = fetch_url_list()
    for url in urls:
        handle = Handle(url=url)
        handle.do()

def fetch_url_list():
    """
    url.txt contents, one article URL per line:
    https://mp.weixin.qq.com/s/dh810UDeHMvUC45MVa6iUQ
    https://mp.weixin.qq.com/s/4xhzaePwiK0dhNJ3FpjKuw
    https://mp.weixin.qq.com/s/fCrCTmDvF69ferHH4OIBWA
    https://mp.weixin.qq.com/s/NdSC_GeUj0MABl5H28nMRg
    https://mp.weixin.qq.com/s/QT-brVK2ay_I2FmMGE2jpg
    ... ...
    """
    path = './remarks/url.txt'
    with open(path, 'r') as u:
        # strip() rather than [:-1], so a last line without a trailing
        # newline is not truncated
        return [url.strip() for url in u.readlines() if len(url) > 10]

if __name__ == '__main__':
    main()
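
download_img deliberately writes image src attributes with an __IMAGE_PATH__/ prefix instead of a real path, so the saved content.html can later be pointed at whatever directory actually serves the images. A minimal sketch of that substitution step; the default target path here is an assumption:

def render_content(article_dir, image_base='/remarks/articles/imgs'):
    # replace the placeholder written by Handle.download_img with the
    # directory that actually serves the downloaded images
    with open(article_dir + '/content.html', 'r', encoding='utf-8') as f:
        html = f.read()
    return html.replace('__IMAGE_PATH__', image_base)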