Scraping products from an e-commerce site
1. Get the links of the product list pages
2. Scrape the product information inside each list page
3. Get the pagination information of the list pages
4. Follow the product URL in each item to scrape the product detail page
5. Save the data locally
Below, scraping some of the list pages of https://www.jollychic.com is used as an example:
import requests, random, os, xlwt, re
import pandas as pd
from bs4 import BeautifulSoup
# STEP 1.1
# Get the category links of products under the home page
def getCateUrl():
    cate_list = [
        #{ 'cate_name': 'teen-girl' ,'cate_url' :'https://www.jollychic.com/teen-girl-c6863'},
        #{ 'cate_name': 'baby-clothing' ,'cate_url' :'https://www.jollychic.com/baby-clothing-c6871'},
        #{ 'cate_name': 'toddler-boy' ,'cate_url' :'https://www.jollychic.com/toddler-boy-c6869'},
        #{ 'cate_name': 'teen-boy' ,'cate_url' :'https://www.jollychic.com/teen-boy-c6865'},
        #{ 'cate_name': 'toddler-girl' ,'cate_url' :'https://www.jollychic.com/toddler-girl-c6867'},
        #{ 'cate_name': 'womens-dresses' ,'cate_url' :'https://www.jollychic.com/womens-dresses-c6?SPM=CAT.WOMEN.C2.C6'},
        #{ 'cate_name': 'womens-sweatshirts-hoodies' ,'cate_url' :'https://www.jollychic.com/womens-sweatshirts-hoodies-c215?SPM=CAT.WOMEN.C2.C215'},
        #{ 'cate_name': 'womens-knitwear' ,'cate_url' :'https://www.jollychic.com/womens-knitwear-c14?SPM=CAT.WOMEN.C2.C14'},
        #{ 'cate_name': 'womens-jackets-coats' ,'cate_url' :'https://www.jollychic.com/womens-jackets-coats-c8?SPM=CAT.WOMEN.C2.C8'},
        #{ 'cate_name': 'womens-sports-shoes' ,'cate_url' :'https://www.jollychic.com/womens-sports-shoes-c964?SPM=CAT.WOMEN.C59.C964'},
        #{ 'cate_name': 'womens-slippers' ,'cate_url' :'https://www.jollychic.com/womens-slippers-c123?SPM=CAT.WOMEN.C59.C123'},
        #{ 'cate_name': 'mens-t-shirts-vests' ,'cate_url' :'https://www.jollychic.com/mens-t-shirts-vests-c352?SPM=CAT.MEN.C324.C352'},
        #{ 'cate_name': 'mens-sweatshirts-hoodies' ,'cate_url' :'https://www.jollychic.com/mens-sweatshirts-hoodies-c340?SPM=CAT.MEN.C324.C340'},
        #{ 'cate_name': 'mens-shirts' ,'cate_url' :'https://www.jollychic.com/mens-shirts-c356?SPM=CAT.MEN.C324.C356'},
        { 'cate_name': 'womens-backpacks' ,'cate_url' :'https://www.jollychic.com/womens-backpacks-c40?SPM=CAT.WOMEN.C35.C40'},
        { 'cate_name': 'womens-shoulder-bags' ,'cate_url' :'https://www.jollychic.com/womens-shoulder-bags-c41?SPM=CAT.WOMEN.C35.C41'},
        #{ 'cate_name': 'womens-crossbody-bags' ,'cate_url' :'https://www.jollychic.com/womens-crossbody-bags-c94?SPM=CAT.WOMEN.C35.C94'},
        #{ 'cate_name': 'womens-sports-shoes' ,'cate_url' :'https://www.jollychic.com/womens-sports-shoes-c964?SPM=CAT.WOMEN.C59.C964'},
        #{ 'cate_name': 'womens-boots' ,'cate_url' :'https://www.jollychic.com/womens-boots-c77?SPM=CAT.WOMEN.C59.C77'},
    ]
    return cate_list
# STEP 1.2
# Get the already-scraped product URLs and their level-1/2/3 categories and SKUs
def getExistInfoList(exist_excel_path):
    exist_pd = pd.read_excel(exist_excel_path)
    exist_pd = exist_pd.drop_duplicates()
    print(exist_pd.columns)
    product_link_list = list(exist_pd['product_link'])
    sku_list = list(exist_pd['sku'])
    cate_1_list = list(exist_pd['cate_1'])
    cate_2_list = list(exist_pd['cate_2'])
    cate_3_list = list(exist_pd['cate_3'])
    exist_info = {
        'product_link': product_link_list,
        'sku': sku_list,
        'cate_1': cate_1_list,
        'cate_2': cate_2_list,
        'cate_3': cate_3_list
    }
    return exist_info
# STEP 2
# Get the URLs that have already been scraped
def getDoneUrl(path):
    done_url = []
    with open(os.path.join(path, 'record.txt'), 'r', encoding="utf-8") as f:
        url_list = f.readlines()
        for url in url_list:
            done_url.append(url.rstrip('\n'))
    print(done_url)
    return done_url
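# record.txt is a plain resume log with one finished URL per line, e.g.
# (contents illustrative, following the list-page URL pattern built in STEP 7):
#   https://www.jollychic.com/womens-backpacks-c40?regioncode=sa&jsort=0011-120&SPM=DL.X.X&regioncode=sa
#   https://www.jollychic.com/womens-backpacks-c40?regioncode=sa&jsort=0012-120&SPM=DL.X.X&regioncode=sa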
# STEP 3
# Fetch a static page and return the parsed HTML
def get_static_html(site_url):
    # Sample proxy address; replace it with a working one of your own
    proxies = {'http': "http://45.79.40.158:8116"}
    headers_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ]
    headers = {
        # Rotate the User-Agent at random to look less like a bot
        'user-agent': random.choice(headers_list),
        'Connection': 'keep-alive'
    }
    try:
        resp = requests.get(site_url, headers=headers, proxies=proxies)
    except Exception as inst:
        # If the proxy fails, retry directly without it (and without TLS verification)
        print(inst)
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(site_url, headers=headers, verify=False)
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup
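# Quick smoke test for the fetcher above. Note again that the proxy IP is only
# a sample and has likely expired; swap in your own or drop the proxies argument.
#   soup = get_static_html('https://www.jollychic.com/womens-backpacks-c40')
#   print(soup.title)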
# STEP 4
# Get the total number of pages under a category
def getTotalPageNums(url):
    soup = get_static_html(url + '?regioncode=sa')
    page_msg_tags = soup.select('.ui-page-last')
    if len(page_msg_tags) == 0:
        return 0
    page_msg = page_msg_tags[0].text
    print(url, ',', page_msg)
    return int(page_msg)
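# Note: if the .ui-page-last element is missing (layout change, bot block, or
# a JavaScript-rendered response), this returns 0 and the category is skipped
# silently in STEP 7, so a guard like this sketch can be useful:
#   if getTotalPageNums(cate_url) == 0:
#       print('WARN: no pagination found for', cate_url)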
# STEP 5.1
# Scrape the breadcrumb and title information from a product detail page
def singlePage(url):
    soup = get_static_html(url)
    if len(soup.select('.categoryTwo-navTag')) == 0:
        return [], []
    bread_tag = soup.select('.categoryTwo-navTag')[0]
    bread_msg = bread_tag.text
    bread_tmp_arr = bread_msg.strip().split(' ')
    bread_arr = [x.strip() for x in bread_tmp_arr if x != '']
    title_tag = soup.select('.goods-title')[0]
    title_msg = title_tag.text.strip()
    title_arr = title_msg.split(' #')
    return bread_arr, title_arr
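# To make the parsing above concrete, hypothetical return values for a detail
# page whose breadcrumb reads "Home Women Bags Backpacks" and whose title ends
# in " #JC12345" (both values invented for illustration):
#   bread_arr -> ['Home', 'Women', 'Bags', 'Backpacks']
#   title_arr -> ['Cute Canvas Backpack', 'JC12345']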
# STEP 5.2
# Scrape the product information on one list page
def getInfoBySoup(url, cate):
    soup = get_static_html(url)
    product_tags = soup.select('.J-categoryTwo-goodsList > li')
    if len(product_tags) == 0:
        print(soup.prettify())
        return []
    else:
        info_list = []
        for tag in product_tags:
            if len(tag.select('.pro_list_msg_1')) > 0:
                info = {'cate_url': url, 'cate': cate}
                desc_tag = tag.select('.pro_list_msg_1')[0]
                info['desc'] = desc_tag.text
                # A product without a discount only carries .normal-price_1,
                # so original price and sale price come from the same tag
                if len(tag.select('.normal-price_1')) > 0:
                    original_price_tags = tag.select('.normal-price_1')
                    sale_price_tags = tag.select('.normal-price_1')
                else:
                    original_price_tags = tag.select('.original-price_1')
                    sale_price_tags = tag.select('.discount-price_1')
                info['original_price'] = original_price_tags[0].text
                info['sale_price'] = sale_price_tags[0].text
                # Extract the numeric part of the price string
                price_pattern = re.compile(r'([0-9 .]+)')
                info['original_price_num'] = price_pattern.findall(info['original_price'])[0]
                info['sale_price_num'] = price_pattern.findall(info['sale_price'])[0]
                link_tags = tag.select('a')
                info['product_link'] = 'https://www.jollychic.com' + link_tags[1].attrs['href']
                info['cate_1'] = ''
                info['cate_2'] = ''
                info['cate_3'] = ''
                info['sku'] = ''
                # Follow the detail page to fill in the category breadcrumb and SKU
                bread_arr, title_arr = singlePage(info['product_link'])
                if len(bread_arr) > 3:
                    info['cate_1'] = bread_arr[1]
                    info['cate_2'] = bread_arr[2]
                    info['cate_3'] = bread_arr[3]
                if len(title_arr) > 1:
                    info['sku'] = title_arr[1]
                print(info)
                info_list.append(info)
            else:
                print('no product in this tag:', tag)
        return info_list
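# Each entry appended to info_list is a flat dict keyed exactly like the heads
# list used in STEP 7 (values here are invented placeholders):
#   {'cate_url': '...', 'cate': 'womens-backpacks', 'desc': '...',
#    'original_price': '...', 'sale_price': '...', 'original_price_num': '...',
#    'sale_price_num': '...', 'product_link': 'https://www.jollychic.com/...',
#    'cate_1': '...', 'cate_2': '...', 'cate_3': '...', 'sku': '...'}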
# STEP 6
# Persist the products of one list page to an Excel file
def exportTask(heads, task_done, path, filename):
    if not os.path.exists(path):
        os.makedirs(path)
    task_xls = xlwt.Workbook(encoding='utf-8')
    task_sheet1 = task_xls.add_sheet('sheet1')
    # Header row, horizontally centered
    header_align = xlwt.Alignment()
    header_align.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_align
    for i in range(len(heads)):
        task_sheet1.col(i).width = 12000
        task_sheet1.write(0, i, heads[i], header_style)
    # Data rows
    for i in range(len(task_done)):
        for j in range(len(heads)):
            task_sheet1.write(i + 1, j, task_done[i][heads[j]])
    filename = "{0}.xls".format(filename.replace(':', '-'))
    print(os.path.join(path, filename))
    task_xls.save(os.path.join(path, filename))
    return filename
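# xlwt writes the legacy .xls format, which caps a sheet at 65,536 rows; a list
# page here holds on the order of 120 products (per the -120 in the jsort
# parameter), so that limit is never hit. An equivalent sketch using pandas
# instead of xlwt (writes .xlsx, keeps the same column order):
#   pd.DataFrame(task_done)[heads].to_excel(os.path.join(path, filename + '.xlsx'), index=False)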
# STEP 7
# Scrape every list page of one category
def downloadExcelByCate2(cate_url, cate, path, num):
    done_url = getDoneUrl(path)
    total_num = getTotalPageNums(cate_url)
    for i in range(1, total_num + 1):
        url = '{1}?regioncode=sa&jsort=001{0}-120&SPM=DL.X.X&regioncode=sa'.format(i, cate_url)
        print('now_page:', url)
        if url not in done_url:
            items = getInfoBySoup(url, cate)
            heads = ['cate_url', 'cate', 'desc', 'sale_price', 'sale_price_num', 'original_price', 'original_price_num',
                     'product_link', 'cate_1', 'cate_2', 'cate_3', 'sku']
            filename = '{0}-{1}-{2}'.format(num, cate, i)
            exportTask(heads, items, path, filename)
            # Record the finished page URL so interrupted runs can resume
            try:
                with open(os.path.join(path, 'record.txt'), 'a+', encoding="utf-8") as f:
                    f.write(url + '\n')
            except Exception as e:
                print(e)
# STEP 8
# Merge all per-page .xls files into one spreadsheet
def connectToOne(src_dir, to_dir, out_file_name):
    excel_list = []
    for file in os.listdir(src_dir):
        if file.endswith('.xls'):
            print("file:", file)
            excel_list.append(pd.read_excel(os.path.join(src_dir, file), dtype={'cate_url': str, 'product_link': str}))
    print('Merging...')
    total_excel = pd.concat(excel_list)
    print('Writing output file...')
    writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name), engine='xlsxwriter', options={'strings_to_urls': False})
    print(os.path.join(to_dir, out_file_name), writer)
    total_excel.to_excel(writer, index=False)
    writer.close()
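# The options= keyword above matches older pandas releases; on recent versions
# it was removed, and the same xlsxwriter setting is passed via engine_kwargs:
#   writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name),
#                           engine='xlsxwriter',
#                           engine_kwargs={'options': {'strings_to_urls': False}})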
def main_hh(save_path, out_path, out_file_name):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    # Make sure record.txt exists before the first read
    with open(os.path.join(save_path, 'record.txt'), 'a+') as f:
        f.write('')
    cate_url_list = getCateUrl()
    done_url = getDoneUrl(save_path)
    for i in range(len(cate_url_list)):
        cate = cate_url_list[i]
        if cate['cate_url'] not in done_url:
            print('loading........', cate['cate_url'])
            downloadExcelByCate2(cate['cate_url'], cate['cate_name'], save_path, i + 1)
    connectToOne(save_path, out_path, out_file_name)
if __name__ == '__main__':
    exist_excel_file_name = 'jollychic1129_finally.xlsx'
    out_file_name = 'jollychic1130_finally.xlsx'
    save_path = 'C:\\Users\\SHEIN\\Desktop\\site_scrap\\AR\\jollychic\\women1130'
    out_path = 'C:\\Users\\SHEIN\\Desktop\\site_scrap\\AR\\jollychic\\site_out\\'
    main_hh(save_path, out_path, out_file_name)
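# getExistInfoList() from STEP 1.2 (and exist_excel_file_name above) is defined
# but never wired into the flow; a hypothetical way to use it for incremental
# runs would be to load the previous export and skip known product links:
#   exist_info = getExistInfoList(exist_excel_file_name)
#   # ...and inside getInfoBySoup:
#   # if info['product_link'] in exist_info['product_link']: continue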
