【技术流】2017年有哪些值得看的电影(豆瓣8.0以上)
马上就要放寒假啦!!
是时候进行一波电影补档!!
但是,什么样的电影才叫好电影??
我们做技术的,就是要用数据说话!
下面分享从豆瓣上爬取下来的数据:2017年豆瓣评分8.0及以上的电影!
分享一下大佬的源码:
# -*- coding: utf-8 -*-
"""Crawl Douban movie tags and save the best-rated films of one release year.

One ``movie_spider`` is created per tag in ``movie_spider.m_movie_class``;
``main`` runs each spider in its own process and every spider writes a
tab-separated ``douban_<category>.txt`` file into a timestamped directory.
"""
import json
import multiprocessing
import os
import time

import requests
# Unused below (responses are decoded as JSON directly); retained so the
# module's import surface is unchanged.
from bs4 import BeautifulSoup  # noqa: F401


def _fetch_json(url, timeout=10):
    """GET *url* and return its body decoded as JSON.

    The body is decoded with ``json.loads`` directly instead of being passed
    through an HTML parser first, which could alter payloads that contain
    markup characters or entities.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
        ValueError: if the body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return json.loads(response.text)


class movie_spider(object):
    """Collect and save movies of one Douban tag.

    Each collected movie is stored in ``self.movie_list`` as a list
    ``[id, title, rate, actors, region, duration]`` where ``rate`` is a float.
    """

    # Category name -> percent-encoded tag used in the search URL.
    # (The 'Janpan' key spelling is kept as-is because it is also used to
    # build the output file name.)
    m_movie_class = {
        'Chinese': '%E5%8D%8E%E8%AF%AD',
        'EuropeandAmerica': '%E6%AC%A7%E7%BE%8E',
        'Korea': '%E9%9F%A9%E5%9B%BD',
        'Janpan': '%E6%97%A5%E6%9C%AC',
        'Action': '%E5%8A%A8%E4%BD%9C',
        'Comedy': '%E5%96%9C%E5%89%A7',
        'Love': '%E7%88%B1%E6%83%85',
        'ScienceFiction': '%E7%A7%91%E5%B9%BB',
        'Suspense': '%E6%82%AC%E7%96%91',
    }

    PAGE_SIZE = 20         # movies requested per search page
    REQUEST_DELAY = 0.5    # seconds slept before each search-page request
    TARGET_YEAR = '2017'   # only movies with this release_year are kept

    def __init__(self, movie_class_name, movie_class_content,
                 iter_number=100, local_path=''):
        """Create a spider for one tag.

        Args:
            movie_class_name: category name, used in the output file name.
            movie_class_content: tag value inserted into the search URL.
            iter_number: upper bound of the ``page_start`` offsets to request,
                i.e. roughly how many movies to query.
            local_path: directory for the output file; ``''`` means the
                current working directory.
        """
        self.movie_list = list()
        self.movie_class_name = movie_class_name
        self.movie_class_content = movie_class_content
        self.iter_number = iter_number
        self.local_path = local_path

    def get_movie_id(self):
        """Page through the tag's search results and process every movie id.

        Raises whatever ``_fetch_json`` raises if a request fails.
        """
        for start in range(0, self.iter_number, self.PAGE_SIZE):
            url = ('https://movie.douban.com/j/search_subjects?type=movie&tag='
                   + str(self.movie_class_content)
                   + '&sort=recommend&page_limit=' + str(self.PAGE_SIZE)
                   + '&page_start=' + str(start))
            time.sleep(self.REQUEST_DELAY)  # throttle the search requests
            for item in _fetch_json(url)['subjects']:
                self.get_movie_infor(item['id'])

    def get_movie_infor(self, id):
        """Fetch the abstract of movie *id* and hand it to get_movie_content."""
        url = ('https://movie.douban.com/j/subject_abstract?subject_id='
               + str(id))
        self.get_movie_content(id, _fetch_json(url)['subject'])

    def get_movie_content(self, id, item):
        """Append the movie to ``self.movie_list`` if it matches TARGET_YEAR.

        Movies whose ``rate`` cannot be converted to a float are skipped
        instead of aborting the crawl, since they cannot be ranked or
        compared against a rating threshold.

        Args:
            id: the movie's subject id.
            item: mapping with at least the keys ``release_year``, ``title``,
                ``rate``, ``actors``, ``region`` and ``duration``.
        """
        if item['release_year'] != self.TARGET_YEAR:
            return
        try:
            rating = float(item['rate'])
        except (TypeError, ValueError):
            return
        # The 'short_comment' field is deliberately not collected; the
        # original author flagged it as problematic.
        self.movie_list.append([
            id,
            item['title'],
            rating,
            item['actors'],
            item['region'],
            item['duration'],
        ])

    def save_movie_content(self, movies, num=10, rate=8.0):
        """Write up to *num* leading movies rated at least *rate* to a file.

        The file is ``douban_<movie_class_name>.txt`` inside
        ``self.local_path``. Each row is the 0-based rank followed by the
        movie's fields, every value terminated by a tab.

        Args:
            movies: movie records, expected to be sorted best-first; writing
                stops at the first record below *rate*.
            num: maximum number of rows to write. Fewer are written when
                *movies* is shorter (this used to raise IndexError).
            rate: minimum rating a movie needs to be written.
        """
        # os.path.join keeps an empty local_path relative to the working
        # directory; plain '/' concatenation pointed at the filesystem root.
        file_path = os.path.join(
            str(self.local_path),
            'douban_' + str(self.movie_class_name) + '.txt')
        with open(file_path, 'w', encoding='utf-8', errors='ignore') as f:
            for rank, movie in enumerate(movies[:max(num, 0)]):
                if not movie[2] >= rate:
                    break
                fields = [str(rank)] + [str(value) for value in movie]
                f.write('\t'.join(fields) + '\t\n')

    def print_movie_content(self, movies):
        """Print every movie record on its own line."""
        for movie in movies:
            print(str(movie) + ' ')

    def start(self):
        """Crawl the tag, rank the movies by rating and save the top ones."""
        print("Process Start: " + str(self.movie_class_name))
        self.get_movie_id()
        init_movies = sorted(self.movie_list,
                             key=lambda movie: movie[2], reverse=True)
        # Save at most 10 movies rated 8.0 or higher; fewer if not enough.
        self.save_movie_content(init_movies, 10, 8.0)


def main():
    """Run one spider per category, each in its own process."""
    local_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    os.makedirs(local_time, exist_ok=True)
    workers = []
    for movie_class_name, tag in movie_spider.m_movie_class.items():
        # 300 is the number of movies to query per category.
        ms = movie_spider(movie_class_name, tag, 300, local_time)
        p = multiprocessing.Process(target=ms.start)
        p.start()
        workers.append(p)
    # Wait for every category to finish before returning.
    for p in workers:
        p.join()


if __name__ == '__main__':
    main()