电子商城商品爬取
1.获取商品列表页链接
2.爬取列表页里面商品信息
3.获取列表页的分页信息
4.根据商品信息中的商品url,爬取商品详情页信息
5.将数据保存在本地
下面以爬取https://www.jollychic.com网站某些列表页为例子:
import requests, random, os, xlwt, math, time, re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver


# STEP 1.1
def getCateUrl():
    """Return the category pages (name + listing URL) to crawl.

    Commented-out entries are categories handled in earlier runs;
    uncomment to re-crawl them.
    """
    cate_list = [
        # {'cate_name': 'teen-girl', 'cate_url': 'https://www.jollychic.com/teen-girl-c6863'},
        # {'cate_name': 'baby-clothing', 'cate_url': 'https://www.jollychic.com/baby-clothing-c6871'},
        # {'cate_name': 'toddler-boy', 'cate_url': 'https://www.jollychic.com/toddler-boy-c6869'},
        # {'cate_name': 'teen-boy', 'cate_url': 'https://www.jollychic.com/teen-boy-c6865'},
        # {'cate_name': 'toddler-girl', 'cate_url': 'https://www.jollychic.com/toddler-girl-c6867'},
        # {'cate_name': 'womens-dresses', 'cate_url': 'https://www.jollychic.com/womens-dresses-c6?SPM=CAT.WOMEN.C2.C6'},
        # {'cate_name': 'womens-sweatshirts-hoodies', 'cate_url': 'https://www.jollychic.com/womens-sweatshirts-hoodies-c215?SPM=CAT.WOMEN.C2.C215'},
        # {'cate_name': 'womens-knitwear', 'cate_url': 'https://www.jollychic.com/womens-knitwear-c14?SPM=CAT.WOMEN.C2.C14'},
        # {'cate_name': 'womens-jackets-coats', 'cate_url': 'https://www.jollychic.com/womens-jackets-coats-c8?SPM=CAT.WOMEN.C2.C8'},
        # {'cate_name': 'womens-sports-shoes', 'cate_url': 'https://www.jollychic.com/womens-sports-shoes-c964?SPM=CAT.WOMEN.C59.C964'},
        # {'cate_name': 'womens-slippers', 'cate_url': 'https://www.jollychic.com/womens-slippers-c123?SPM=CAT.WOMEN.C59.C123'},
        # {'cate_name': 'mens-t-shirts-vests', 'cate_url': 'https://www.jollychic.com/mens-t-shirts-vests-c352?SPM=CAT.MEN.C324.C352'},
        # {'cate_name': 'mens-sweatshirts-hoodies', 'cate_url': 'https://www.jollychic.com/mens-sweatshirts-hoodies-c340?SPM=CAT.MEN.C324.C340'},
        # {'cate_name': 'mens-shirts', 'cate_url': 'https://www.jollychic.com/mens-shirts-c356?SPM=CAT.MEN.C324.C356'},
        {'cate_name': 'womens-backpacks', 'cate_url': 'https://www.jollychic.com/womens-backpacks-c40?SPM=CAT.WOMEN.C35.C40'},
        {'cate_name': 'womens-shoulder-bags', 'cate_url': 'https://www.jollychic.com/womens-shoulder-bags-c41?SPM=CAT.WOMEN.C35.C41'},
        # {'cate_name': 'womens-crossbody-bags', 'cate_url': 'https://www.jollychic.com/womens-crossbody-bags-c94?SPM=CAT.WOMEN.C35.C94'},
        # {'cate_name': 'womens-sports-shoes', 'cate_url': 'https://www.jollychic.com/womens-sports-shoes-c964?SPM=CAT.WOMEN.C59.C964'},
        # {'cate_name': 'womens-boots', 'cate_url': 'https://www.jollychic.com/womens-boots-c77?SPM=CAT.WOMEN.C59.C77'},
    ]
    return cate_list


# STEP 1.2
def getExistInfoList(exsit_excel_path):
    """Load previously scraped products from an Excel file.

    Returns a dict of parallel lists (product_link, sku, cate_1/2/3)
    with duplicate rows dropped.
    """
    exist_pd = pd.read_excel(exsit_excel_path)
    exist_pd = exist_pd.drop_duplicates()
    print(exist_pd.columns)
    exist_info = {
        'product_link': list(exist_pd['product_link']),
        'sku': list(exist_pd['sku']),
        'cate_1': list(exist_pd['cate_1']),
        'cate_2': list(exist_pd['cate_2']),
        'cate_3': list(exist_pd['cate_3']),
    }
    return exist_info


# STEP 2
def getDoneUrl(path):
    """Return the URLs already crawled, one per line of <path>/record.txt."""
    with open(os.path.join(path, 'record.txt'), 'r', encoding="utf-8") as f:
        done_url = [line.rstrip('\n') for line in f.readlines()]
    print(done_url)
    return done_url


# STEP 3
def get_static_html(site_url):
    """Fetch a page and return it parsed as a BeautifulSoup tree.

    Tries the configured HTTP proxy first; when that request raises, it
    falls back to a direct request with TLS verification disabled so a
    dead proxy does not abort the whole crawl.
    """
    # NOTE(review): only an 'http' proxy is configured; https URLs (like the
    # ones crawled here) bypass it -- confirm that is intended.
    proxies = {'http': "http://45.79.40.158:8116"}
    headers_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ]
    headers = {
        # FIX: random.choice picks uniformly; the original
        # headers_list[random.randint(0, len(headers_list)) - 1] could yield
        # index -1, which aliases the last entry and doubled its odds.
        'user-agent': random.choice(headers_list),
        # FIX: original value 'keep - alive' contained stray spaces and is
        # not a valid Connection header token.
        'Connection': 'keep-alive',
    }
    try:
        resp = requests.get(site_url, headers=headers, proxies=proxies)
    except Exception as inst:
        # Best-effort fallback: retry without the proxy and without TLS
        # verification (warnings silenced to keep the log readable).
        print(inst)
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(site_url, headers=headers, verify=False)
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup


# STEP 4
def getTotalPageNums(url):
    """Return the number of listing pages for a category (0 if no pager)."""
    soup = get_static_html(url + '?regioncode=sa')
    page_msg_tage = soup.select('.ui-page-last')
    if len(page_msg_tage) == 0:
        return 0
    page_msg = page_msg_tage[0].text
    print(url, ',', page_msg)
    return int(page_msg)


# STEP 5.1
def singlePage(url):
    """Scrape one product detail page.

    Returns (bread_arr, title_arr): the breadcrumb segments and the
    goods title split on ' #' (the SKU follows the '#'). Both lists are
    empty when the page has no breadcrumb (e.g. blocked or missing page).
    """
    soup = get_static_html(url)
    if len(soup.select('.categoryTwo-navTag')) == 0:
        return [], []
    bread_tag = soup.select('.categoryTwo-navTag')[0]
    bread_tmp_arr = bread_tag.text.strip().split(' ')
    bread_arr = [x.strip() for x in bread_tmp_arr if x != '']
    title_tag = soup.select('.goods-title')[0]
    title_arr = title_tag.text.strip().split(' #')
    return bread_arr, title_arr


# STEP 5.3
def getInfoBySoup(url, cate):
    """Scrape every product on one listing page.

    For each product, collects description, prices, link, and (via the
    detail page) the three category levels and SKU. Returns a list of
    dicts; empty when the page contains no product list.
    """
    soup = get_static_html(url)
    product_tage = soup.select('.J-categoryTwo-goodsList > li')
    if len(product_tage) == 0:
        # Dump the page so anti-bot / empty responses can be diagnosed.
        print(soup.prettify())
        return []
    info_list = []
    price_pattern = re.compile(r'([0-9 .]+)')
    for tag in product_tage:
        if len(tag.select('.pro_list_msg_1')) > 0:
            info = {'cate_url': url, 'cate': cate}
            desc_tag = tag.select('.pro_list_msg_1')[0]
            info['desc'] = desc_tag.text
            # Non-discounted goods only carry .normal-price_1; discounted
            # ones carry original + discount price tags.
            if len(tag.select('.normal-price_1')) > 0:
                original_price_tage = tag.select('.normal-price_1')
                sale_price_tage = tag.select('.normal-price_1')
            else:
                original_price_tage = tag.select('.original-price_1')
                sale_price_tage = tag.select('.discount-price_1')
            info['original_price'] = original_price_tage[0].text
            info['sale_price'] = sale_price_tage[0].text
            info['original_price_num'] = price_pattern.findall(info['original_price'])[0]
            info['sale_price_num'] = price_pattern.findall(info['sale_price'])[0]
            link_tage = tag.select('a')
            info['product_link'] = 'https://www.jollychic.com' + link_tage[1].attrs['href']
            info['cate_1'] = ''
            info['cate_2'] = ''
            info['cate_3'] = ''
            info['sku'] = ''
            # Follow the detail page for breadcrumb categories and SKU.
            bread_arr, title_arr = singlePage(info['product_link'])
            if len(bread_arr) > 3:
                info['cate_1'] = bread_arr[1]
                info['cate_2'] = bread_arr[2]
                info['cate_3'] = bread_arr[3]
            if len(title_arr) > 1:
                info['sku'] = title_arr[1]
            print(info)
            info_list.append(info)
        else:
            print('无商品,,,,', tag)
    return info_list


# STEP 6
def exportTask(heads, task_done, path, filename):
    """Persist one page of scraped products to <path>/<filename>.xls.

    heads: ordered column names (keys of each dict in task_done).
    Returns the final .xls filename (':' replaced with '-' so Windows
    accepts timestamps in names).
    """
    if not os.path.exists(path):
        os.makedirs(path)
    task_xls = xlwt.Workbook(encoding='utf-8')
    task_sheet1 = task_xls.add_sheet('sheet1')
    # Header row: centered.
    header_allign = xlwt.Alignment()
    header_allign.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_allign
    for i in range(len(heads)):
        task_sheet1.col(i).width = 12000
        task_sheet1.write(0, i, heads[i], header_style)
    # Data rows.
    for i in range(len(task_done)):
        for j in range(len(heads)):
            task_sheet1.write(i + 1, j, task_done[i][heads[j]])
    filename = "{0}.xls".format(filename.replace(':', '-'))
    print(os.path.join(path, filename))
    task_xls.save(os.path.join(path, filename))
    return filename


# STEP 7
def dowloadExcelByCate2(cate_url, cate, path, num):
    """Crawl every listing page of one category and export each to Excel.

    Each finished page URL is appended to <path>/record.txt so an
    interrupted run can resume without re-crawling.
    """
    # FIX: use the 'path' parameter; the original read the module-level
    # global 'save_path', which only worked by accident.
    done_url = getDoneUrl(path)
    total_num = getTotalPageNums(cate_url)
    for i in range(1, total_num + 1):
        # FIX: the original template contained '®ioncode' -- the HTML
        # entity '&reg' was mangled into '®' when the code was published.
        url = '{1}?regioncode=sa&jsort=001{0}-120&SPM=DL.X.X&regioncode=sa'.format(i, cate_url)
        print('now_page:', url)
        if url not in done_url:
            items = getInfoBySoup(url, cate)
            heads = ['cate_url', 'cate', 'desc', 'sale_price', 'sale_price_num',
                     'original_price', 'original_price_num', 'product_link',
                     'cate_1', 'cate_2', 'cate_3', 'sku']
            filename = '{0}-{1}-{2}'.format(num, cate, i)
            exportTask(heads, items, path, filename)
            try:
                with open(os.path.join(path, 'record.txt'), 'a+', encoding="utf-8") as f:
                    f.write(url + '\n')
            except Exception as e:
                print(e)


# STEP 8
def connectToOne(dir, to_dir, out_file_name):
    """Merge every per-page .xls in <dir> into one workbook in <to_dir>."""
    excel_list = []
    for file in os.listdir(dir):
        if file.endswith('.xls'):
            print("file:", file)
            excel_list.append(pd.read_excel(
                os.path.join(dir, file),
                dtype={'cate_url': str, 'product_link': str},
            ))
    print('开始合并')
    total_excel = pd.concat(excel_list)
    print('生成文件')
    # strings_to_urls=False keeps long product links as plain text (xlsxwriter
    # otherwise rejects/converts them). NOTE(review): 'options=' is deprecated
    # in newer pandas (use engine_kwargs) -- kept for the pandas version in use.
    writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name),
                            engine='xlsxwriter',
                            options={'strings_to_urls': False})
    print(os.path.join(to_dir, out_file_name), writer)
    total_excel.to_excel(writer, index=False)
    writer.close()


def main_hh(save_path, out_path, out_file_name):
    """Crawl all configured categories, then merge the results."""
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    # Make sure record.txt exists before getDoneUrl reads it.
    with open(os.path.join(save_path, 'record.txt'), 'a+') as f:
        f.write('')
    cate_url_list = getCateUrl()
    done_url = getDoneUrl(save_path)
    for i in range(len(cate_url_list)):
        cate = cate_url_list[i]
        if cate['cate_url'] not in done_url:
            print('loading........', cate['cate_url'])
            dowloadExcelByCate2(cate['cate_url'], cate['cate_name'], save_path, i + 1)
            # FIX: removed stray 'javascript:void(0);' line -- a copy-paste
            # artifact from the blog page that only parsed by accident.
    connectToOne(save_path, out_path, out_file_name)


if __name__ == '__main__':
    exsit_excel_file_name = 'jollychic1129_finally.xlsx'
    out_file_name = 'jollychic1130_finally.xlsx'
    save_path = 'C:\\Users\\SHEIN\\Desktop\\site_scrap\\AR\\jollychic\\women1130'
    out_path = 'C:\\Users\\SHEIN\\Desktop\\site_scrap\\AR\\jollychic\\site_out\\'
    main_hh(save_path, out_path, out_file_name)