This page looks best with JavaScript enabled

QQ-Friends-Word-Cloud

 ·  ☕ 2 min read

例子


实现方法

  • 使用 Chrome headless 模式结合 Selenium WebDriver, 通过扫描二维码登录 QZone 并获取 Cookie

  • 携带登录成功的 cookie 和 token 访问好友空间

  • 遍历好友所有说说, 解析数据存入数据库

  • 使用 jieba 分词和 wordcloud 词云实现词云图

  • 环境要求: ChromeDriver, Chrome version >= 64, Python 2.X

login_qzone.py

参考 http://shenchao.me/2016/04/16/qqzone/

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# coding=utf-8
import re
from time import sleep

from PIL import Image
from selenium import webdriver


def login_by_qr():
    """Log in to QZone by scanning a QR code and collect the session credentials.

    Starts a headless Chrome, opens the QZone home page, saves a screenshot of
    it to ``QR.png`` and displays that image, then waits a fixed amount of time
    for the QR code to be scanned before reading the cookies and page source.

    :return: tuple ``(cookie, gtk, token)`` where ``cookie`` maps cookie names
        to values, ``gtk`` is the value computed from the ``p_skey`` cookie and
        ``token`` is the ``g_qzonetoken`` expression text found in the page.
    :raises RuntimeError: if ``g_qzonetoken`` cannot be found in the page source.
    :raises KeyError: if the ``p_skey`` cookie is absent (presumably because the
        login did not complete within the waiting time).
    """

    def getGTK(cookie):
        """Compute the g_tk value from the ``p_skey`` cookie."""
        hashes = 5381
        for letter in cookie['p_skey']:
            hashes += (hashes << 5) + ord(letter)
        return hashes & 0x7fffffff

    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--disable-gpu')
    browser = webdriver.Chrome(executable_path='etc/chromedriver_linux', options=option)
    try:
        url = "https://qzone.qq.com/"
        browser.get(url)
        sleep(3)  # give the login page time to render the QR code
        browser.get_screenshot_as_file('QR.png')
        im = Image.open('QR.png')
        im.show()
        print ('wait scanner QR')
        sleep(20)  # time allowed for scanning; tune to network speed and machine
        # The original format string had no placeholder and printed nothing.
        print(u'{}'.format(browser.title))
        cookie = {elem['name']: elem['value'] for elem in browser.get_cookies()}
        html = browser.page_source
        # Extract g_qzonetoken from the page source.
        g_qzonetoken = re.search(r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)', html)
        if g_qzonetoken is None:
            raise RuntimeError('g_qzonetoken not found in page source; login may have failed')
        token = g_qzonetoken.group(1)
        gtk = getGTK(cookie)
        # Report success once, and only after the credentials were obtained.
        print('Get the cookie of qq login successfully!')
        return cookie, gtk, token
    finally:
        # Always shut the browser down, even when login fails midway.
        browser.quit()

fetch_moods.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
# coding=utf-8

import csv
import json
import re

import requests

from login_qzone import login_by_qr
from utils.util import clear_text

# Endpoint that Fetcher.fetch requests (GET) for each page of a user's moods.
GET_MOODS_URL = 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6'

# Pattern applied with re.match to the response text: captures what follows
# `_preloadCallback(` up to (not including) the first `})`. The match is
# non-greedy, and the caller re-appends the closing `}` before JSON decoding.
RESP_REGEX = r'_preloadCallback\((.*?)\}\)'


def read_friends_qq(csv_file_path):
    """Read friends' QQ numbers from a CSV file.

    The first row is treated as a header and skipped. For every other row the
    fourth column is taken with its last seven characters removed (presumably
    an ``@qq.com`` suffix -- confirm against the export format).

    Rows with fewer than four columns (for example blank lines) are skipped,
    and an empty file yields an empty list.

    :param csv_file_path: path of the CSV file to read
    :return: list of QQ number strings, in file order
    """
    friends = []
    # The context manager closes the file; the original leaked the handle.
    with open(csv_file_path) as csv_file:
        csv_reader = csv.reader(csv_file)
        # Builtin next() works on Python 2 and 3; the default avoids
        # StopIteration on an empty file.
        next(csv_reader, None)
        for row in csv_reader:
            if len(row) < 4:
                continue
            friends.append(row[3][:-7])
    return friends


class Fetcher(object):
    """Download the moods of QZone users page by page.

    Call :meth:`login` first; :meth:`fetch` refuses to run without credentials.
    """

    # Number of moods requested per page.
    PAGE_SIZE = 20
    # Give up on a user after this many failed pages in a row, so a dead
    # session or network outage cannot loop forever.
    MAX_CONSECUTIVE_FAILURES = 5

    def __init__(self):
        """Set up the request headers and empty credentials."""
        self.headers = {
            'Host': 'h5.qzone.qq.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://user.qzone.qq.com/408776303?_t_=0.22746974226377736',
            'Connection': 'keep-alive'
        }
        self.cookie = None
        self.gtk = None
        self.token = None

    def login(self):
        """Log in through the QR-code flow and store cookie, gtk and token."""
        self.cookie, self.gtk, self.token = login_by_qr()

    def fetch(self, qq):
        """Yield every mood of the user ``qq`` as a dict (see ``_wrap_mood``).

        Pages are requested starting at offset 0 until a page contains no
        moods. A page that cannot be fetched or parsed is reported and
        skipped; after ``MAX_CONSECUTIVE_FAILURES`` such pages in a row the
        generator stops instead of requesting forever.

        :param qq: QQ number of the user whose moods are fetched
        :return: generator of mood dicts; empty if not logged in
        """
        if not self.cookie or not self.gtk or not self.token:
            print ('Need login!')
            return
        count = 0
        # Start at offset 0 so the first page is included; the original began
        # at 20 and therefore never fetched the first 20 moods.
        pos = 0
        failures = 0
        session = requests.session()
        try:
            while True:
                msg_list = self._fetch_page(session, qq, pos)
                pos += self.PAGE_SIZE
                if msg_list is None:
                    failures += 1
                    if failures >= self.MAX_CONSECUTIVE_FAILURES:
                        print(u'{}-aborted after {} consecutive failed pages'.format(qq, failures))
                        break
                    continue
                failures = 0
                if not msg_list:
                    print(u'{}-total: {}'.format(qq, count))
                    break
                count += len(msg_list)
                for item in msg_list:
                    try:
                        mood = self._wrap_mood(qq, item)
                    except Exception as e:
                        # Best effort: one malformed item must not lose the page.
                        print(e)
                        continue
                    yield mood
        finally:
            session.close()

    def _fetch_page(self, session, qq, pos):
        """Request one page of moods.

        :param session: requests session used for the call
        :param qq: QQ number of the user
        :param pos: offset of the first mood of the page
        :return: the ``msglist`` list of the response (``[]`` when it is
            missing or not a list), or ``None`` when the page could not be
            fetched or parsed
        """
        params = {
            'uin': qq,
            'ftype': '0',
            'sort': '0',
            'pos': pos,
            'num': str(self.PAGE_SIZE),
            'replynum': '100',
            'g_tk': self.gtk,
            'callback': '_preloadCallback',
            'code_version': '1',
            'format': 'jsonp',
            'need_private_comment': '1',
            'qzonetoken': self.token
        }
        try:
            response = session.request('GET', GET_MOODS_URL, params=params, headers=self.headers,
                                       cookies=self.cookie)
            print(response.status_code)
            matcher = re.match(RESP_REGEX, response.text)
            if not matcher:
                return None
            # The regex drops the closing brace of the JSON object; restore it.
            data = json.loads(matcher.group(1) + '}')
        except Exception as e:
            # Best effort, as before: report the failure and skip the page.
            print(e)
            return None
        if not data or not isinstance(data, dict):
            return None
        msg_list = data.get('msglist')
        return msg_list if isinstance(msg_list, list) else []

    @staticmethod
    def _wrap_mood(qq, item):
        """Build the mood dict for one ``msglist`` item.

        :param qq: QQ number the mood belongs to
        :param item: one entry of the response's ``msglist``
        :return: dict with ``content``, ``created_time``, ``qq``, ``nick_name``
        """
        mood = {
            'content': clear_text(item.get('content')),
            'created_time': item.get('created_time'),
            'qq': qq,
            'nick_name': clear_text(item.get('name'))
        }
        return mood

generator_cloud.py

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# coding=utf-8

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def generator_cloud(name, text):
    """Render a word cloud of ``text`` and save it as ``png/<name>.png``.

    The text is segmented with jieba (full mode) and the space-joined words
    are passed to WordCloud. Nothing is written when there is no text to
    render.

    :param name: file name (without extension) of the image to store
    :param text: text the word cloud is generated from
    :return: None
    """
    import os

    if not text:
        return
    # Segment the text into words.
    word_list_after_jieba = jieba.cut(text, cut_all=True)
    wl_space_split = " ".join(word_list_after_jieba)
    if not wl_space_split.strip():
        return
    # Build the word cloud.
    my_word_cloud = WordCloud(font_path='etc/font.ttf',
                              width=1800,
                              height=800,
                              margin=5).generate(wl_space_split)
    if not os.path.isdir('png'):
        os.makedirs('png')
    # Use a dedicated figure and close it afterwards: pyplot keeps figures
    # alive until closed, so repeated calls would otherwise stack images on
    # the same axes and leak memory.
    figure = plt.figure()
    try:
        plt.imshow(my_word_cloud)
        plt.axis("off")
        plt.savefig('png/{}.png'.format(name))
    finally:
        plt.close(figure)
Support the author with
alipay QR Code
wechat QR Code

Yang
WRITTEN BY
Yang
Developer