# Project analysis & write-up: http://blog.csdn.net/zhyh1435589631/article/details/53053949
# Environment: pyspider + centos7 + mysql 5.5
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-11-05 23:18:55
# Project: taobao_food

from pyspider.libs.base_handler import *
import re
import json
import MySQLdb

# Extracts the JSON payload out of the jsonp wrapper, e.g. "jsonp221({...})".
# Compiled once at module level; written as a raw string with the literal
# dot escaped (the original 'window.jsonp' let '.' match any character).
JSONP_PATTERN = re.compile(r'window\.jsonp.*?\((.*?)\)', re.S)


class Handler(BaseHandler):
    """Crawl a Taobao food-listing jsonp endpoint and store items in MySQL."""

    def __init__(self):
        # Database connection configuration.
        # NOTE(review): credentials are hard-coded; move to a config file
        # for anything beyond a demo.
        db_host = "127.0.0.1"
        user = "root"
        passwd = "zhyh2010"
        db = "taobao_food"
        charset = "utf8"
        conn = MySQLdb.connect(host=db_host, user=user, passwd=passwd,
                               db=db, charset=charset)
        conn.autocommit(True)
        # Keep a reference to the connection itself — the original only
        # kept the cursor, leaving the connection object unreferenced.
        self.conn = conn
        self.db = conn.cursor()

    # Crawler entry point: fetch the listing once every 24 hours.
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(
            'https://tce.taobao.com/api/mget.htm?callback=jsonp221&tce_sid=659631&tce_vid=8,2&tid=,&tab=,&topic=,&count=,&env=online,online',
            callback=self.json_parser)

    # Strip the jsonp wrapper and return the raw JSON text.
    @config(age=24 * 60 * 60)
    def select_json(self, response):
        """Return the JSON string embedded in the jsonp response body.

        Raises:
            ValueError: when the body does not contain the expected
                jsonp wrapper (instead of the original opaque IndexError).
        """
        content = response.text
        matches = JSONP_PATTERN.findall(content)
        if not matches:
            raise ValueError("no jsonp payload found in response")
        return matches[0].strip()

    # Insert each item's picture URL, discount price and title into MySQL.
    def product_info(self, response):
        """Store every item of one sub-result dict into the food_info table.

        ``response`` is one sub-result of the jsonp payload; its "result"
        key holds the item list (presumably — verify against the tce API).
        """
        for data in response["result"]:
            # Parameterized query: the driver escapes the values, so no
            # string-built SQL. execute() replaces the original
            # executemany() over a single one-tuple list.
            sql = "insert into food_info(url, price, title) values (%s,%s,%s)"
            self.db.execute(sql, ("https:" + data["item_pic"],
                                  data["item_youhui_price"],
                                  data["item_title"]))

    # Parse the jsonp response and persist every sub-result's items.
    @config(age=24 * 60 * 60)
    def json_parser(self, response):
        content = self.select_json(response)
        contents = json.loads(content)
        # "result" maps tce ids to sub-result dicts; store each one.
        subres = contents["result"]
        for key in subres:
            self.product_info(subres[key])
点赞 7

相关推荐

牛客网
牛客企业服务