流浪地球影评+词云python

爬取猫眼评论

1、用手机模式，查看评论，找到json链接，并进行分析
http://m.maoyan.com/review/v2/comments.json?movieId=248906&userId=-1&offset=45&limit=15&ts=1549764694911&type=3

offset 表示当前加载位置：从0开始
limit 每次加载15条
ts 当前加载时刻的 Unix 时间戳（单位：毫秒）

import time
import datetime
import requests
import json
import pandas as pd

# Scrape Maoyan comments for movie 248906 and save them to a CSV file.
#
# Paging works by time cursor rather than by offset: every request keeps
# offset=0 and passes `ts`, and after each page `ts` is moved back to the
# startTime of the last comment received.

# Mobile-browser request headers (the JSON endpoint is the mobile site's).
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'Connection': 'keep-alive',
}

# Raw Cookie header captured from a browser session. requests expects one
# dict entry per cookie, so split the header into name/value pairs instead
# of passing the whole string under a single key named "Cookie".
_raw_cookie = '_lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=168d514e681c8-021028248d570e8-4c312e7e-e1000-168d514e682c8; _lxsdk_s=168d514e682-8e5-057-19e%7C%7C99; _lxsdk=8EC724B02CD611E9825429306E395B52452326EC7C5F4B70A0E6A79F33450C9E; __mta=119831223.1549763143337.1549764335344.1549764338641.4; uuid_n_v=v1; iuuid=8EC724B02CD611E9825429306E395B52452326EC7C5F4B70A0E6A79F33450C9E; ci=129%2C%E5%A4%A7%E5%90%8C'
cookies = dict(pair.split('=', 1) for pair in _raw_cookie.split('; '))

url = 'http://m.maoyan.com/review/v2/comments.json?movieId=248906&userId=-1&offset=0&limit=15&ts={}&type=3'

# Scraped fields, kept as parallel lists (one entry per comment).
comment = []
nick = []
gender = []
score = []
comment_time = []
userLevel = []   # user level
userId = []      # user id
upCount = []     # number of likes
replyCount = []  # number of replies

# Paging cursor: current Unix time in milliseconds.
url_time = int(time.time()) * 1000
for ji in range(1, 1001):
    url_range = url.format(url_time)  # fill the ts cursor into the URL
    print('正在爬去第'+str(ji)+'页')
    try:
        res = requests.get(url_range, headers=headers, cookies=cookies, timeout=10)
        res.raise_for_status()
        res.encoding = 'utf-8'
        # json.loads() no longer accepts an `encoding` keyword (removed in
        # Python 3.9); res.text is already a decoded str.
        content = json.loads(res.text)
        list_ = content['data']['comments']
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # Stop paging but keep what was collected so far, so that it is
        # still written to the CSV below.
        print('第'+str(ji)+'页请求或解析失败，停止翻页：'+str(err))
        break

    if not list_:
        # No more comments: further requests would only repeat this page.
        break

    for item in list_:
        comment.append(item['content'])
        nick.append(item['nick'])
        score.append(item['score'])
        # startTime is divided by 1000 here, i.e. treated as milliseconds.
        comment_time.append(datetime.datetime.fromtimestamp(int(item['startTime'])/1000))
        gender.append(item['gender'])
        userId.append(item['userId'])
        userLevel.append(item['userLevel'])
        replyCount.append(item['replyCount'])
        upCount.append(item['upCount'])

    # Advance the cursor to the last comment of this page, regardless of
    # how many items the page held (a short page must not stall the cursor).
    url_time = list_[-1]['startTime']

print("爬取完成")
result = {
    '用户id': userId,
    '用户昵称': nick,
    '用户等级': userLevel,
    '性别': gender,
    '时间': comment_time,
    '评分': score,
    '评论内容': comment,
    '点赞': upCount,
    '评论': replyCount,
}
results = pd.DataFrame(result)
results.info()
# Raw string: "\流" is not a valid escape sequence in a normal literal.
results.to_csv(r"d:\流浪地球.csv")

词云

import pandas as pd
import wordcloud
import matplotlib.pyplot as plt
import jieba
from collections import Counter
import numpy as np
#jieba.load_userdict("new.txt") #新定义词典
# Build a word cloud from the scraped comment texts.

# Raw string: "\流" is not a valid escape sequence in a normal literal.
df = pd.read_csv(r'd:\流浪地球.csv')

# Concatenate all comment texts in one pass. Missing cells are read back
# as NaN (a float), which cannot be concatenated to a str, so drop them.
comments = ''.join(df['评论内容'].dropna().astype(str))

# Load the stop-word list (one word per line); the context manager closes
# the file, and a set gives constant-time membership tests.
with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
    stopwords = {line.rstrip() for line in stop_file}

# Precise-mode segmentation (cut_all=False).
segs = jieba.cut(comments, cut_all=False)

# Keep tokens that are neither stop words nor whitespace-only; blank
# tokens carry no meaning in a word cloud.
cloud_text = [seg for seg in segs if seg.strip() and seg not in stopwords]

# Word -> occurrence count.
fre = Counter(cloud_text)

wc = wordcloud.WordCloud(
    font_path="C:\\Windows\\Fonts\\STFANGSO.ttf",  # font file used to render the words
    max_words=150,
    max_font_size=250,
    width=1000,
    height=860,
)

wc.generate_from_frequencies(fre)
plt.imshow(wc)
plt.axis('off')
plt.show()
wc.to_file(r'd:\流浪地球_词云.png')