爬取模块(WebMagic)
爬取模块(WebMagic)
一、概述
该项目的职位数据主要来自爬虫自动爬取,现已对拉勾网、51job等网站的职位、公司信息进行成功爬取。本爬虫模块采用的是WebMagic框架,使用HttpClient生成POST请求,然后将爬取的信息筛选存储至MySQL数据库。其中souzhi-crawler-service源代码可自行点击查看。
二、组织架构
souzhi-crawler-service
│  pom.xml
│
└─src
    ├─main
    │  ├─java
    │  │  └─com
    │  │      └─couragehe
    │  │          └─souzhi
    │  │              │  SouzhiCrawlerServiceApplication.java
    │  │              │
    │  │              └─crawler
    │  │                  ├─mapper
    │  │                  │      CompanyMapper.java
    │  │                  │      PositionDetailMapper.java
    │  │                  │      PositionMapper.java
    │  │                  │
    │  │                  └─task
    │  │                      │  Application.java
    │  │                      │  TaskTest.java
    │  │                      │  WebsiteDownloader.java
    │  │                      │  WebsitePipeline.java
    │  │                      │  WebsiteProcessor.java
    │  │                      │
    │  │                      └─website
    │  │                              Job51Spider.java
    │  │                              LagouSpider.java
    │  │                              WebsiteSpider.java
    │  │
    │  └─resources
    │      │  application.properties
    │      │
    │      ├─mapper
    │      │      CompanyMapper.xml
    │      │      PositionDetailMapper.xml
    │      │      PositionMapper.xml
    │      │
    │      ├─static
    │      └─templates
    └─test
        └─java
            └─com
                └─couragehe
                    └─souzhi
                            CrawlerTest.java
                            SouzhiCrawlerServiceApplicationTests.java
三、maven依赖
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the souzhi-crawler-service module.
  Inherits from souzhi-parent and adds the WebMagic crawler libraries on top
  of the project's shared API and service-util modules.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.couragehe.souzhi</groupId>
        <artifactId>souzhi-parent</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>
    <groupId>com.couragehe.souzhi</groupId>
    <artifactId>souzhi-crawler-service</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>souzhi-crawler-service</name>
    <description>Demo project for Spring Boot</description>
    <properties>
        <java.version>1.8</java.version>
        <!-- Single source of truth for both WebMagic artifacts so they cannot drift apart. -->
        <webmagic.version>0.7.3</webmagic.version>
    </properties>
    <dependencies>
        <!-- Shared project API module. -->
        <dependency>
            <groupId>com.couragehe.souzhi</groupId>
            <artifactId>souzhi-api</artifactId>
            <version>0.0.1-SNAPSHOT</version>
            <!--
              These two logging artifacts are kept off the classpath.
              NOTE(review): presumably to avoid clashing with the SLF4J binding
              pulled in by WebMagic; confirm before removing.
            -->
            <exclusions>
                <exclusion>
                    <groupId>ch.qos.logback</groupId>
                    <artifactId>logback-classic</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>log4j-over-slf4j</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- Shared service utilities module. -->
        <dependency>
            <groupId>com.couragehe.souzhi</groupId>
            <artifactId>souzhi-service-util</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>
        <!-- Spring Boot hot-reload support (devtools); no version is given here, so it must be managed upstream. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <optional>true</optional>
        </dependency>
        <!-- WebMagic core package -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
        <!-- WebMagic extension package; must stay on the same version as webmagic-core. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
四、数据库设计
1、数据库模型

2、SQL源码
-- MySQL schema for the crawler's storage: company, position, position_detail.
-- The script is re-runnable, but each table is dropped before it is recreated,
-- so running it discards any rows already stored in these tables.

-- ----------------------------
-- Table structure for company
-- ----------------------------
DROP TABLE IF EXISTS `company`;
CREATE TABLE `company` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `company_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_logo` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司图标',
  `company_size` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司规模',
  `industry_field` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '行业领域',
  `finance_stage` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '融资阶段',
  `company_label_list` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司优待标签',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for position
-- ----------------------------
DROP TABLE IF EXISTS `position`;
CREATE TABLE `position` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `position_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  -- NOTE(review): presumably refers to company.id; no foreign key is declared.
  `company_id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司名称',
  -- Column name keeps its original spelling ("lables") so existing mappers keep working.
  `skill_lables` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '技能清单',
  -- Both time columns are stored as text, not as DATE/DATETIME values.
  `create_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职务创建时间',
  `format_create_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '格式化后的时间',
  `work_city` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作地点',
  `work_district` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作地区',
  `work_salary` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作薪资',
  `work_year` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作经验',
  `work_nature` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作性质',
  -- Comment literal restored: the source text had lost its opening quote and first character.
  `education_require` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '学历要求',
  `position_advantage` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职位优势',
  `is_school_job` int(32) NULL DEFAULT 0 COMMENT '是否校招',
  `detail_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '详情链接',
  `origin_website` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '来源网站',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for position_detail
-- ----------------------------
DROP TABLE IF EXISTS `position_detail`;
CREATE TABLE `position_detail` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  -- NOTE(review): presumably refers to position.id; no foreign key is declared.
  `position_id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `position_desc` text CHARACTER SET utf8 COLLATE utf8_bin NULL COMMENT '职位描述',
  `position_address` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职位具体地址',
  -- Primary key added to match company and position; id was already NOT NULL.
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

查看7道真题和解析