【技术流】2017年有哪些值得看的电影(豆瓣8.0以上)
马上就要放寒假啦!!
是时候进行一波电影补档!!
但是,什么样的电影才叫好电影??
我们做技术的,就是要用数据说话!
下面分享从豆瓣上爬取的数据:2017年豆瓣评分8.0及以上的电影!
分享一下大佬的源码:
# -*- coding: utf-8 -*-
import time
import requests
from bs4 import BeautifulSoup
import json
import time
import os
import multiprocessing
class movie_spider(object):
    """Collect Douban movies for one category tag and save the best-rated ones.

    An instance walks the Douban search listing for a single tag, fetches the
    abstract of every listed movie, keeps those released in ``release_year``
    and finally writes the highest-rated entries to a text file.

    Each kept movie is stored in ``movie_list`` as a list of
    ``[id, title, rate, actors, region, duration]`` where ``rate`` is a float.
    """

    # Maps a category name to the URL-encoded tag used in the search query.
    # NOTE: 'Janpan' is a misspelling of "Japan"; it is kept unchanged because
    # the key is used for lookups and ends up in the output file name.
    m_movie_class = {'Chinese':'%E5%8D%8E%E8%AF%AD','EuropeandAmerica':'%E6%AC%A7%E7%BE%8E','Korea':'%E9%9F%A9%E5%9B%BD',
                     'Janpan':'%E6%97%A5%E6%9C%AC','Action':'%E5%8A%A8%E4%BD%9C','Comedy':'%E5%96%9C%E5%89%A7',
                     'Love':'%E7%88%B1%E6%83%85','ScienceFiction':'%E7%A7%91%E5%B9%BB','Suspense':'%E6%82%AC%E7%96%91'}

    # Number of entries requested per listing page.
    page_size = 20
    # Seconds to wait for an HTTP response before giving up, so a stalled
    # connection cannot hang a worker forever.
    request_timeout = 10

    def __init__(self, movie_class_name, movie_class_content, iter_number=100,
                 local_path='', release_year='2017'):
        """Set up a spider for one category.

        Args:
            movie_class_name: Category name; used in log output and in the
                output file name.
            movie_class_content: URL-encoded tag placed in the search query.
            iter_number: Upper bound on how many listing entries to walk.
            local_path: Directory the result file is written to. An empty
                string means the current working directory.
            release_year: Only movies whose ``release_year`` matches this
                value are kept. Defaults to ``'2017'``.
        """
        self.movie_list = list()
        self.movie_class_name = movie_class_name
        self.movie_class_content = movie_class_content
        self.iter_number = iter_number
        self.local_path = local_path
        self.release_year = release_year

    def _fetch_json(self, url):
        """Request *url* and return its body decoded as JSON.

        The response text is handed to ``json.loads`` directly; routing it
        through an HTML parser first could alter payloads that contain
        markup-like characters.

        Raises:
            requests.RequestException: On timeout, connection failure or an
                HTTP error status.
            ValueError: If the body is not valid JSON.
        """
        response = requests.get(url, timeout=self.request_timeout)
        response.raise_for_status()
        return json.loads(response.text)

    def get_movie_id(self):
        """Walk the listing pages of the tag and process every movie found.

        Pages are requested ``page_size`` entries at a time until
        ``iter_number`` entries have been covered or a page comes back empty.
        """
        for start in range(0, self.iter_number, self.page_size):
            url = ('https://movie.douban.com/j/search_subjects?type=movie&tag='
                   + str(self.movie_class_content)
                   + '&sort=recommend&page_limit=' + str(self.page_size)
                   + '&page_start=' + str(start))
            # Pause between listing requests to avoid hammering the server.
            time.sleep(0.5)
            subjects = self._fetch_json(url)['subjects']
            if not subjects:
                # Nothing left for this tag; later pages would be empty too.
                break
            for item in subjects:
                self.get_movie_infor(item['id'])

    def get_movie_infor(self, id):
        """Fetch the abstract of the movie *id* and hand it to the filter.

        The parameter name shadows the ``id`` builtin; it is kept so existing
        keyword callers continue to work.
        """
        url = 'https://movie.douban.com/j/subject_abstract?subject_id=' + str(id)
        subject = self._fetch_json(url)['subject']
        self.get_movie_content(id, subject)

    def get_movie_content(self, id, item):
        """Append the movie to ``movie_list`` if it matches the release year.

        Args:
            id: Movie identifier stored as the first field of the record.
            item: Mapping with the keys ``release_year``, ``title``, ``rate``,
                ``actors``, ``region`` and ``duration``.

        Movies whose rating cannot be converted to a float (for example an
        empty string) are skipped, since they cannot be ranked.
        """
        if str(item.get('release_year')) != str(self.release_year):
            return
        try:
            rating = float(item.get('rate'))
        except (TypeError, ValueError):
            return
        self.movie_list.append([
            id,
            item['title'],
            rating,
            item['actors'],
            item['region'],
            item['duration'],
        ])

    def save_movie_content(self, movies, num=10, rate=8.0):
        """Write up to *num* leading movies rated at least *rate* to a file.

        Args:
            movies: Movie records, expected to be sorted by rating in
                descending order; writing stops at the first record below
                *rate*.
            num: Maximum number of records to write. Fewer are written when
                *movies* is shorter.
            rate: Minimum rating a record needs to be written.

        The file is ``douban_<movie_class_name>.txt`` inside ``local_path``.
        Every line holds the rank index followed by the record fields, each
        followed by a tab.
        """
        # os.path.join keeps an empty local_path relative to the working
        # directory instead of producing a path at the filesystem root.
        file_path = os.path.join(str(self.local_path),
                                 'douban_' + str(self.movie_class_name) + '.txt')
        with open(file_path, 'w', encoding='utf-8', errors='ignore') as f:
            # Slicing bounds the loop by the list length, so a short list
            # cannot cause an IndexError.
            for rank, movie in enumerate(movies[:num]):
                if movie[2] < rate:
                    break
                fields = [rank] + list(movie)
                f.write('\t'.join(str(field) for field in fields) + '\t\n')

    def print_movie_content(self, movies):
        """Print all movie records on one line, each followed by a space."""
        print(''.join(str(movie) + ' ' for movie in movies))

    def start(self,):
        """Run the full pipeline: crawl, rank by rating, save the top ten."""
        print("Process Start: " + str(self.movie_class_name))
        self.get_movie_id()
        ranked = sorted(self.movie_list, key=lambda movie: movie[2], reverse=True)
        # Save at most 10 movies rated 8.0 or higher; fewer if not enough qualify.
        self.save_movie_content(ranked, 10, 8.0)
def main():
    """Crawl every category in its own process and wait for all to finish.

    A directory named after the current local time is created, then one
    worker process per entry of ``movie_spider.m_movie_class`` is started.
    Each worker writes its result file into that directory.
    """
    # One process (not thread) per category.
    local_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    # exist_ok avoids a crash when the directory for this timestamp is
    # already present, e.g. on two runs within the same second.
    os.makedirs(local_time, exist_ok=True)
    workers = []
    for movie_class_name, movie_class_content in movie_spider.m_movie_class.items():
        # 300 is the number of listing entries to walk per category.
        ms = movie_spider(movie_class_name, movie_class_content, 300, local_time)
        p = multiprocessing.Process(target=ms.start)
        p.start()
        workers.append(p)
    # Wait explicitly so main() returns only after every category is saved.
    for p in workers:
        p.join()


# The guard is required for multiprocessing: child processes that re-import
# this module must not start the crawl again.
if __name__ == '__main__':
    main()
查看19道真题和解析