# 1688 scraper script

import  requests,random,os,xlwt,math,time,re,pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver



# Download a dynamically rendered page
def get_dynamic_html2(site_url):
    """Open ``site_url`` in Chrome via Selenium and return the parsed page.

    The page is loaded, scrolled through with ``fullpage_screenshot`` over a
    fixed height of 8000 (presumably so lazily loaded items are present in
    the DOM -- confirm), and ``driver.page_source`` is then parsed with
    BeautifulSoup's ``'html.parser'``.

    The browser is always shut down, including when scrolling or reading the
    page source raises; in that case the exception propagates to the caller
    after ``driver.quit()`` has been attempted.

    NOTE(review): ``CHROME_DRIVER_PATH`` is not defined in the visible part
    of this file -- confirm it is provided before this function is called.
    NOTE(review): the ``executable_path`` / ``chrome_options`` keywords are
    deprecated in Selenium 4 -- confirm against the installed version.

    Args:
        site_url: URL of the page to load.

    Returns:
        A ``BeautifulSoup`` object built from the page source.
    """
    print('开始加载',site_url,'动态页面')
    chrome_options = webdriver.ChromeOptions()
    # '--headless' is deliberately not in this list: the original had it
    # commented out.
    for argument in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--ignore-ssl-errors',
    ):
        chrome_options.add_argument(argument)

    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,chrome_options=chrome_options)
    try:
        driver.set_page_load_timeout(100)
        driver.set_window_size(1920, 1080)
        try:
            driver.get(site_url)
        except Exception as e:
            # Page load exceeded the timeout (or otherwise failed): stop
            # loading and carry on with whatever has been rendered so far.
            driver.execute_script('window.stop()')
            print(e, 'dynamic web load timeout')

        time.sleep(2)

        fullpage_screenshot(driver, 8000)

        page_html = driver.page_source
        # Pause kept from the original implementation before closing Chrome.
        time.sleep(3)
    finally:
        # Always release the browser process; a failure to quit is reported
        # but must not mask the result or the original exception.
        try:
            driver.quit()
        except Exception as e:
            print(e, 'failed to quit chrome driver')

    return BeautifulSoup(page_html, 'html.parser')



# Scroll through the page
def fullpage_screenshot(driver, total_height, scroll_pause=0.5):
    """Scroll the browser window across the page one viewport at a time.

    Despite the name, no screenshot is written: only the scrolling and the
    progress messages are performed.

    The area ``document.body.offsetWidth`` x ``total_height`` is tiled with
    viewport-sized rectangles, row by row starting at the top-left. The
    window is scrolled to the top-left corner of every rectangle except the
    first one.

    Args:
        driver: Object exposing ``execute_script(script)``; a Selenium
            WebDriver in this file.
        total_height: Height of the region to scroll through.
        scroll_pause: Seconds to sleep after each scroll. The default of
            0.5 is the value that used to be hard-coded.

    Returns:
        ``True`` once every rectangle has been visited. ``False`` when the
        browser reports a viewport width or height of 0 (or ``None``),
        because the tiling step would then be zero and the loops below
        could never advance.
    """
    total_width = driver.execute_script("return document.body.offsetWidth")
    viewport_width = driver.execute_script("return document.body.clientWidth")
    viewport_height = driver.execute_script("return window.innerHeight")

    # Guard against a zero step, which previously caused an endless loop.
    if not viewport_width or not viewport_height:
        print("Viewport size is {0}x{1}; nothing to scroll".format(viewport_width, viewport_height))
        return False

    # Build (left, top, right, bottom) tiles, clipped to the page bounds.
    rectangles = []
    top = 0
    while top < total_height:
        bottom = min(top + viewport_height, total_height)
        left = 0
        while left < total_width:
            right = min(left + viewport_width, total_width)
            rectangles.append((left, top, right, bottom))
            left += viewport_width
        top += viewport_height

    for part, (left, top, _right, _bottom) in enumerate(rectangles):
        # The first tile needs no scroll; every later tile is scrolled to.
        if part > 0:
            driver.execute_script("window.scrollTo({0}, {1})".format(left, top))
            print("Scrolled To ({0},{1})".format(left, top))
            time.sleep(scroll_pause)

        file_name = "part_{0}.png".format(part)
        print("Capturing {0} ...".format(file_name))

        # For a last, partially filled row the reported offset is pulled up
        # so that a full viewport would end exactly at total_height.
        if top + viewport_height > total_height:
            offset = (left, total_height - viewport_height)
        else:
            offset = (left, top)

        print("Adding to stitched image with offset ({0}, {1})".format(offset[0], offset[1]))
    print("Finishing chrome full page screenshot workaround...")
    return True



if __name__ == '__main__':
    # Fetch one search-results page and print, for every offer item found:
    # image alt text, offer detail URL, shop name and shop URL.
    soup_0 = get_dynamic_html2('https://s.1688.com/selloffer/offer_search.htm?keywords=%C5%B7%C3%C0%C5%AE%D7%B0&n=y&netType=1%2C11%2C16')
    info_tag_list = soup_0.select('.sm-offer-item')
    for info_tag in info_tag_list:
        # Scraped markup is not guaranteed to be uniform: an item missing the
        # expected attribute, image or third link is skipped instead of
        # aborting the whole run.
        try:
            skc = info_tag.attrs['trace-obj_value']
            a_tag_list = info_tag.select('a')
            # The first link is expected to wrap the product image, the
            # third link to point at the shop.
            img_tag = a_tag_list[0].select('img')[0]
            shop_tag = a_tag_list[2]
            desc = img_tag.attrs['alt']
            shop_url = shop_tag.attrs['href']
        except (KeyError, IndexError) as e:
            print('skip offer item with unexpected markup:', repr(e))
            continue

        url = 'https://detail.1688.com/offer/{0}.html'.format(skc)
        shop_name = shop_tag.text
        print(desc,url,shop_name,shop_url)
    # Number of offer items matched by the selector (including skipped ones).
    print(len(info_tag_list))
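# --- Web-page residue captured together with the script; not part of the program ---
# 全部评论
#
# 相关推荐
#
# 宇信外包 Java 7.5k
# 点赞 评论 收藏
# 转发
# 投递字节跳动等公司10个岗位
# 点赞 评论 收藏
# 转发
# 点赞 收藏 评论
# 分享
# 牛客网
# 牛客企业服务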