Scraping products from an e-commerce site
1. Get the links of the product list pages
2. Scrape the product information inside each list page
3. Get the pagination information of the list pages
4. Follow the product URL in each item to scrape the product detail page
5. Save the data locally
Below, scraping some of the list pages of https://www.jollychic.com is used as an example:
import requests, random, os, xlwt, re
import pandas as pd
from bs4 import BeautifulSoup
# STEP 1.1
# Get the category links of products under the home page
def getCateUrl():
    cate_list = [
        #{ 'cate_name': 'teen-girl' ,'cate_url' :'https://www.jollychic.com/teen-girl-c6863'},
        #{ 'cate_name': 'baby-clothing' ,'cate_url' :'https://www.jollychic.com/baby-clothing-c6871'},
        #{ 'cate_name': 'toddler-boy' ,'cate_url' :'https://www.jollychic.com/toddler-boy-c6869'},
        #{ 'cate_name': 'teen-boy' ,'cate_url' :'https://www.jollychic.com/teen-boy-c6865'},
        #{ 'cate_name': 'toddler-girl' ,'cate_url' :'https://www.jollychic.com/toddler-girl-c6867'},
        #{ 'cate_name': 'womens-dresses' ,'cate_url' :'https://www.jollychic.com/womens-dresses-c6?SPM=CAT.WOMEN.C2.C6'},
        #{ 'cate_name': 'womens-sweatshirts-hoodies' ,'cate_url' :'https://www.jollychic.com/womens-sweatshirts-hoodies-c215?SPM=CAT.WOMEN.C2.C215'},
        #{ 'cate_name': 'womens-knitwear' ,'cate_url' :'https://www.jollychic.com/womens-knitwear-c14?SPM=CAT.WOMEN.C2.C14'},
        #{ 'cate_name': 'womens-jackets-coats' ,'cate_url' :'https://www.jollychic.com/womens-jackets-coats-c8?SPM=CAT.WOMEN.C2.C8'},
        #{ 'cate_name': 'womens-sports-shoes' ,'cate_url' :'https://www.jollychic.com/womens-sports-shoes-c964?SPM=CAT.WOMEN.C59.C964'},
        #{ 'cate_name': 'womens-slippers' ,'cate_url' :'https://www.jollychic.com/womens-slippers-c123?SPM=CAT.WOMEN.C59.C123'},
        #{ 'cate_name': 'mens-t-shirts-vests' ,'cate_url' :'https://www.jollychic.com/mens-t-shirts-vests-c352?SPM=CAT.MEN.C324.C352'},
        #{ 'cate_name': 'mens-sweatshirts-hoodies' ,'cate_url' :'https://www.jollychic.com/mens-sweatshirts-hoodies-c340?SPM=CAT.MEN.C324.C340'},
        #{ 'cate_name': 'mens-shirts' ,'cate_url' :'https://www.jollychic.com/mens-shirts-c356?SPM=CAT.MEN.C324.C356'},
        { 'cate_name': 'womens-backpacks' ,'cate_url' :'https://www.jollychic.com/womens-backpacks-c40?SPM=CAT.WOMEN.C35.C40'},
        { 'cate_name': 'womens-shoulder-bags' ,'cate_url' :'https://www.jollychic.com/womens-shoulder-bags-c41?SPM=CAT.WOMEN.C35.C41'},
        #{ 'cate_name': 'womens-crossbody-bags' ,'cate_url' :'https://www.jollychic.com/womens-crossbody-bags-c94?SPM=CAT.WOMEN.C35.C94'},
        #{ 'cate_name': 'womens-sports-shoes' ,'cate_url' :'https://www.jollychic.com/womens-sports-shoes-c964?SPM=CAT.WOMEN.C59.C964'},
        #{ 'cate_name': 'womens-boots' ,'cate_url' :'https://www.jollychic.com/womens-boots-c77?SPM=CAT.WOMEN.C59.C77'},
    ]
    return cate_list
# STEP 1.2
# Get the already-scraped product URLs and their level-1/2/3 categories and SKUs
def getExistInfoList(exist_excel_path):
    exist_pd = pd.read_excel(exist_excel_path)
    exist_pd = exist_pd.drop_duplicates()
    print(exist_pd.columns)
    product_link_list = list(exist_pd['product_link'])
    sku_list = list(exist_pd['sku'])
    cate_1_list = list(exist_pd['cate_1'])
    cate_2_list = list(exist_pd['cate_2'])
    cate_3_list = list(exist_pd['cate_3'])
    exist_info = {
        'product_link': product_link_list,
        'sku': sku_list,
        'cate_1': cate_1_list,
        'cate_2': cate_2_list,
        'cate_3': cate_3_list
    }
    return exist_info
# STEP 2
# Get the URLs that have already been scraped
def getDoneUrl(path):
    done_url = []
    with open(os.path.join(path, 'record.txt'), 'r', encoding="utf-8") as f:
        url_list = f.readlines()
        for url in url_list:
            done_url.append(url.rstrip('\n'))
    print(done_url)
    return done_url
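# record.txt is a plain resume log with one finished URL per line, e.g.
# (contents illustrative, following the list-page URL pattern built in STEP 7):
#   https://www.jollychic.com/womens-backpacks-c40?regioncode=sa&jsort=0011-120&SPM=DL.X.X&regioncode=sa
#   https://www.jollychic.com/womens-backpacks-c40?regioncode=sa&jsort=0012-120&SPM=DL.X.X&regioncode=sa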
# STEP 3
# Fetch a static page and return the parsed HTML
def get_static_html(site_url):
    # Sample proxy address; replace it with a working one of your own
    proxies = {'http': "http://45.79.40.158:8116"}
    headers_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ]
    headers = {
        # Rotate the User-Agent at random to look less like a bot
        'user-agent': random.choice(headers_list),
        'Connection': 'keep-alive'
    }
    try:
        resp = requests.get(site_url, headers=headers, proxies=proxies)
    except Exception as inst:
        # If the proxy fails, retry directly without it (and without TLS verification)
        print(inst)
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(site_url, headers=headers, verify=False)
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup
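# Quick smoke test for the fetcher above. Note again that the proxy IP is only
# a sample and has likely expired; swap in your own or drop the proxies argument.
#   soup = get_static_html('https://www.jollychic.com/womens-backpacks-c40')
#   print(soup.title)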
# STEP 4
# Get the total number of pages under a category
def getTotalPageNums(url):
    soup = get_static_html(url + '?regioncode=sa')
    page_msg_tags = soup.select('.ui-page-last')
    if len(page_msg_tags) == 0:
        return 0
    page_msg = page_msg_tags[0].text
    print(url, ',', page_msg)
    return int(page_msg)
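# Note: if the .ui-page-last element is missing (layout change, bot block, or
# a JavaScript-rendered response), this returns 0 and the category is skipped
# silently in STEP 7, so a guard like this sketch can be useful:
#   if getTotalPageNums(cate_url) == 0:
#       print('WARN: no pagination found for', cate_url)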
# STEP 5.1
# Scrape the breadcrumb and title information from a product detail page
def singlePage(url):
    soup = get_static_html(url)
    if len(soup.select('.categoryTwo-navTag')) == 0:
        return [], []
    bread_tag = soup.select('.categoryTwo-navTag')[0]
    bread_msg = bread_tag.text
    bread_tmp_arr = bread_msg.strip().split(' ')
    bread_arr = [x.strip() for x in bread_tmp_arr if x != '']
    title_tag = soup.select('.goods-title')[0]
    title_msg = title_tag.text.strip()
    title_arr = title_msg.split(' #')
    return bread_arr, title_arr
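# To make the parsing above concrete, hypothetical return values for a detail
# page whose breadcrumb reads "Home Women Bags Backpacks" and whose title ends
# in " #JC12345" (both values invented for illustration):
#   bread_arr -> ['Home', 'Women', 'Bags', 'Backpacks']
#   title_arr -> ['Cute Canvas Backpack', 'JC12345']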
# STEP 5.2
# Scrape the product information on one list page
def getInfoBySoup(url, cate):
    soup = get_static_html(url)
    product_tags = soup.select('.J-categoryTwo-goodsList > li')
    if len(product_tags) == 0:
        print(soup.prettify())
        return []
    else:
        info_list = []
        for tag in product_tags:
            if len(tag.select('.pro_list_msg_1')) > 0:
                info = {'cate_url': url, 'cate': cate}
                desc_tag = tag.select('.pro_list_msg_1')[0]
                info['desc'] = desc_tag.text
                # A product without a discount only carries .normal-price_1,
                # so original price and sale price come from the same tag
                if len(tag.select('.normal-price_1')) > 0:
                    original_price_tags = tag.select('.normal-price_1')
                    sale_price_tags = tag.select('.normal-price_1')
                else:
                    original_price_tags = tag.select('.original-price_1')
                    sale_price_tags = tag.select('.discount-price_1')
                info['original_price'] = original_price_tags[0].text
                info['sale_price'] = sale_price_tags[0].text
                # Extract the numeric part of the price string
                price_pattern = re.compile(r'([0-9 .]+)')
                info['original_price_num'] = price_pattern.findall(info['original_price'])[0]
                info['sale_price_num'] = price_pattern.findall(info['sale_price'])[0]
                link_tags = tag.select('a')
                info['product_link'] = 'https://www.jollychic.com' + link_tags[1].attrs['href']
                info['cate_1'] = ''
                info['cate_2'] = ''
                info['cate_3'] = ''
                info['sku'] = ''
                # Follow the detail page to fill in the category breadcrumb and SKU
                bread_arr, title_arr = singlePage(info['product_link'])
                if len(bread_arr) > 3:
                    info['cate_1'] = bread_arr[1]
                    info['cate_2'] = bread_arr[2]
                    info['cate_3'] = bread_arr[3]
                if len(title_arr) > 1:
                    info['sku'] = title_arr[1]
                print(info)
                info_list.append(info)
            else:
                print('no product in this tag:', tag)
        return info_list
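# Each entry appended to info_list is a flat dict keyed exactly like the heads
# list used in STEP 7 (values here are invented placeholders):
#   {'cate_url': '...', 'cate': 'womens-backpacks', 'desc': '...',
#    'original_price': '...', 'sale_price': '...', 'original_price_num': '...',
#    'sale_price_num': '...', 'product_link': 'https://www.jollychic.com/...',
#    'cate_1': '...', 'cate_2': '...', 'cate_3': '...', 'sku': '...'}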
# STEP 6
# Persist the products of one list page to an Excel file
def exportTask(heads, task_done, path, filename):
    if not os.path.exists(path):
        os.makedirs(path)
    task_xls = xlwt.Workbook(encoding='utf-8')
    task_sheet1 = task_xls.add_sheet('sheet1')
    # Header row, horizontally centered
    header_align = xlwt.Alignment()
    header_align.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_align
    for i in range(len(heads)):
        task_sheet1.col(i).width = 12000
        task_sheet1.write(0, i, heads[i], header_style)
    # Data rows
    for i in range(len(task_done)):
        for j in range(len(heads)):
            task_sheet1.write(i + 1, j, task_done[i][heads[j]])
    filename = "{0}.xls".format(filename.replace(':', '-'))
    print(os.path.join(path, filename))
    task_xls.save(os.path.join(path, filename))
    return filename
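# xlwt writes the legacy .xls format, which caps a sheet at 65,536 rows; a list
# page here holds on the order of 120 products (per the -120 in the jsort
# parameter), so that limit is never hit. An equivalent sketch using pandas
# instead of xlwt (writes .xlsx, keeps the same column order):
#   pd.DataFrame(task_done)[heads].to_excel(os.path.join(path, filename + '.xlsx'), index=False)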
# STEP 7
# Scrape every list page of one category
def downloadExcelByCate2(cate_url, cate, path, num):
    done_url = getDoneUrl(path)
    total_num = getTotalPageNums(cate_url)
    for i in range(1, total_num + 1):
        url = '{1}?regioncode=sa&jsort=001{0}-120&SPM=DL.X.X&regioncode=sa'.format(i, cate_url)
        print('now_page:', url)
        if url not in done_url:
            items = getInfoBySoup(url, cate)
            heads = ['cate_url', 'cate', 'desc', 'sale_price', 'sale_price_num', 'original_price', 'original_price_num',
                     'product_link', 'cate_1', 'cate_2', 'cate_3', 'sku']
            filename = '{0}-{1}-{2}'.format(num, cate, i)
            exportTask(heads, items, path, filename)
            # Record the finished page URL so interrupted runs can resume
            try:
                with open(os.path.join(path, 'record.txt'), 'a+', encoding="utf-8") as f:
                    f.write(url + '\n')
            except Exception as e:
                print(e)
# STEP 8
# Merge all per-page .xls files into one spreadsheet
def connectToOne(src_dir, to_dir, out_file_name):
    excel_list = []
    for file in os.listdir(src_dir):
        if file.endswith('.xls'):
            print("file:", file)
            excel_list.append(pd.read_excel(os.path.join(src_dir, file), dtype={'cate_url': str, 'product_link': str}))
    print('Merging...')
    total_excel = pd.concat(excel_list)
    print('Writing output file...')
    writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name), engine='xlsxwriter', options={'strings_to_urls': False})
    print(os.path.join(to_dir, out_file_name), writer)
    total_excel.to_excel(writer, index=False)
    writer.close()
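# The options= keyword above matches older pandas releases; on recent versions
# it was removed, and the same xlsxwriter setting is passed via engine_kwargs:
#   writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name),
#                           engine='xlsxwriter',
#                           engine_kwargs={'options': {'strings_to_urls': False}})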
def main_hh(save_path, out_path, out_file_name):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    # Make sure record.txt exists before the first read
    with open(os.path.join(save_path, 'record.txt'), 'a+') as f:
        f.write('')
    cate_url_list = getCateUrl()
    done_url = getDoneUrl(save_path)
    for i in range(len(cate_url_list)):
        cate = cate_url_list[i]
        if cate['cate_url'] not in done_url:
            print('loading........', cate['cate_url'])
            downloadExcelByCate2(cate['cate_url'], cate['cate_name'], save_path, i + 1)
    connectToOne(save_path, out_path, out_file_name)
if __name__ == '__main__':
    exist_excel_file_name = 'jollychic1129_finally.xlsx'
    out_file_name = 'jollychic1130_finally.xlsx'
    save_path = 'C:\\Users\\SHEIN\\Desktop\\site_scrap\\AR\\jollychic\\women1130'
    out_path = 'C:\\Users\\SHEIN\\Desktop\\site_scrap\\AR\\jollychic\\site_out\\'
    main_hh(save_path, out_path, out_file_name)
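# getExistInfoList() from STEP 1.2 (and exist_excel_file_name above) is defined
# but never wired into the flow; a hypothetical way to use it for incremental
# runs would be to load the previous export and skip known product links:
#   exist_info = getExistInfoList(exist_excel_file_name)
#   # ...and inside getInfoBySoup:
#   # if info['product_link'] in exist_info['product_link']: continue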
