利用HttpClient及HtmlCleaner实现的一个CSDN博客搜索下载爬虫

package com.bigdata;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import java.io.*;
import java.nio.charset.StandardCharsets;

/**
 * Crawler that runs a CSDN blog search, walks every result page and saves the
 * text of each linked article to {@code E:/CSDN/<page>/<title>.txt}.
 *
 * <p>The HTTP client and the HTML cleaner are shared static state that is
 * assigned in {@link #main(String[])}; {@link #DownlodList(String, int)} and
 * {@link #DownloadPage(String, int)} read those fields and therefore must be
 * called after {@code main} has assigned them.
 */
public class CSDN {

    /** Root directory under which one sub-directory per result page is created. */
    private static final String OUTPUT_ROOT = "E:/CSDN/";

    private static HtmlCleaner cleaner;
    private static CloseableHttpClient client;

    /**
     * Entry point: fetches the first search page, reads the total page count
     * from the pager and then processes every result page in order.
     *
     * @param args unused
     * @throws Exception on any network, parsing or file error
     */
    public static void main(String[] args) throws Exception {
        cleaner = new HtmlCleaner();
        // try-with-resources guarantees the connection pool is released on exit.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            client = httpClient;
            String soso = "https://so.csdn.net/so/search/s.do?p=1&q=学习&t=blog";
            System.out.println("爬取地址为:" + soso);
            TagNode clean = cleaner.clean(fetch(soso));
            // The fifth pager button carries the total number of pages.
            String pageNumberPath = "//a[@class='btn btn-xs btn-default'][5]/text()";
            int pageNumber = parsePageCount(clean.evaluateXPath(pageNumberPath));
            for (int i = 1; i <= pageNumber; i++) {
                String url = "https://so.csdn.net/so/search/s.do?p=" + i + "&q=学习&t=blog&domain=&o=&s=&u=&l=&f=&rbg=0";
                DownlodList(url, i);
            }
        }
    }

    /**
     * Parses one search-result page and downloads every article it links to.
     *
     * @param URL  address of the search-result page
     * @param Page 1-based page index, used as the output sub-directory name
     * @throws Exception on any network, parsing or file error
     */
    public static void DownlodList(String URL, int Page) throws Exception {
        TagNode clean = cleaner.clean(fetch(URL));
        // Create the output directory once per page rather than once per result.
        pageDirectory(Page);
        for (int i = 1; i <= 10; i++) {
            String listPath = "//dl[@class='search-list J_search'][" + i + "]/dd[@class='author-time']/span[@class='link']/a";
            Object[] matches = clean.evaluateXPath(listPath);
            for (Object match : matches) {
                String articleUrl = linkTarget(match);
                if (!articleUrl.isEmpty()) {
                    DownloadPage(articleUrl, Page);
                }
            }
        }
    }

    /**
     * Downloads a single article and writes its text to a UTF-8 file named
     * after the article title. Pages on which the title or the content element
     * cannot be found are skipped with a message on standard error.
     *
     * @param PageURL address of the article
     * @param Page    1-based page index, used as the output sub-directory name
     * @throws Exception on any network, parsing or file error
     */
    public static void DownloadPage(String PageURL, int Page) throws Exception {
        TagNode clean = cleaner.clean(fetch(PageURL));
        Object[] title = clean.evaluateXPath("//h1[@class='title-article']/text()");
        Object[] contents = clean.evaluateXPath("//div[@id='content_views']//text()");
        if (title.length == 0 || contents.length == 0) {
            System.err.println("Skipping " + PageURL + ": title or content not found");
            return;
        }
        File file = new File(pageDirectory(Page), safeFileName(title[0].toString()) + ".txt");
        // NOTE(review): only the first match is written, as before; whether that
        // covers the whole article depends on what HtmlCleaner returns for
        // "//text()" - confirm against the library version in use.
        byte[] payload = contents[0].toString().getBytes(StandardCharsets.UTF_8);
        // FileOutputStream creates the file when missing; closing it flushes and
        // releases the handle.
        try (OutputStream out = new FileOutputStream(file)) {
            out.write(payload);
        }
    }

    /**
     * Performs a GET request and returns the response body as text, closing the
     * response afterwards. UTF-8 is used when the entity declares no charset.
     */
    private static String fetch(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        try (CloseableHttpResponse response = client.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            return entity == null ? "" : EntityUtils.toString(entity, StandardCharsets.UTF_8);
        }
    }

    /**
     * Extracts the page count from the pager XPath result by keeping only its
     * digits. Falls back to a single page when nothing usable is found.
     */
    private static int parsePageCount(Object[] matches) {
        if (matches == null || matches.length == 0) {
            return 1;
        }
        String digits = String.valueOf(matches[0]).replaceAll("\\D", "");
        if (digits.isEmpty()) {
            return 1;
        }
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException tooLarge) {
            // Digits only, so the sole failure mode is a value beyond int range.
            return 1;
        }
    }

    /**
     * Returns the URL represented by an XPath match: the {@code href} attribute
     * when the match is an element, otherwise its string form. Returns an empty
     * string when no URL is available.
     */
    private static String linkTarget(Object match) {
        if (match == null) {
            return "";
        }
        if (match instanceof TagNode) {
            // Presumably TagNode.toString() is not the link target, so read the
            // attribute explicitly - confirm against the HtmlCleaner version in use.
            String href = ((TagNode) match).getAttributeByName("href");
            return href == null ? "" : href.trim();
        }
        return match.toString().trim();
    }

    /** Returns the output directory for a result page, creating it (and parents) if needed. */
    private static File pageDirectory(int page) throws IOException {
        File dir = new File(OUTPUT_ROOT + page);
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create output directory: " + dir);
        }
        return dir;
    }

    /** Replaces characters that are not allowed in Windows file names with an underscore. */
    private static String safeFileName(String rawTitle) {
        String cleaned = rawTitle.trim().replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
        return cleaned.isEmpty() ? "untitled" : cleaned;
    }
}

 

全部评论

相关推荐

家人们,我现在真的好纠结。我是26届的,目前还没有实习过。我现在的情况是,想参加秋招,但是感觉自己的简历特别空,没有实习经历会不会秋招直接凉凉啊?可我又听说现在很多公司对26届实习生也不太感冒,说什么不确定性大。而且我最近在准备考公,时间上也有点冲突。要是把时间花在实习上,备考时间就少了。但要是不实习,又怕以后就业有问题😫有没有懂行的友友帮我分析分析:26届现在不实习,秋招找工作真的会很难吗?考公和实习该怎么平衡啊?如果现在不实习,考完公再去找实习还来得及吗?真的太焦虑了,希望大家能给我点建议🙏
小破站_程序员YT:我可能和大家的观点不一样。人的精力是有限的,不能既要还要。你又想实习又想考公最后又要秋招上岸,我觉得哪有那么多的选择。你如果想考上岸,那就全力以赴。如果想秋招上岸,就继续投实习,投没了,就继续准备秋招,秋招不行继续春招。别到最后,考公没上岸,觉得是花了时间浪费在找实习上了, 秋招没上岸,觉得是浪费时间准备考公去了。我是认为很难说可以去平衡 不喜勿喷,可以叫我删除
实习与准备秋招该如何平衡
点赞 评论 收藏
分享
qq乃乃好喝到咩噗茶:院校后面加上211标签,放大加粗,招呼语也写上211
点赞 评论 收藏
分享
评论
点赞
收藏
分享

创作者周榜

更多
牛客网
牛客网在线编程
牛客网题解
牛客企业服务