MapReduce

MapReduce is a programming model that splits a job into two phases, Map and Reduce. To build a simple distributed program, a user only needs to write two functions: map() and reduce().
The problems MapReduce can solve share a common trait: the task can be decomposed into multiple subproblems that are largely independent of one another; once these subproblems are completed in parallel, the overall task is done.
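
For example, in the word-count program below the data flows through the two phases roughly as follows (a conceptual sketch, not the actual job output):

    input line:     "hello world hello"
    map output:     (hello, 1) (world, 1) (hello, 1)
    after shuffle:  (hello, [1, 1])  (world, [1])
    reduce output:  (hello, 2) (world, 1)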

The First MapReduce Program (WordCount)

  • Code

      package edu.hut;
    
      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.fs.Path;
      import org.apache.hadoop.io.IntWritable;
      import org.apache.hadoop.io.LongWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.Job;
      import org.apache.hadoop.mapreduce.Mapper;
      import org.apache.hadoop.mapreduce.Reducer;
      import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
      import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
      import java.io.IOException;
    
      public class WordCount {
    
          public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    
              Text k = new Text();
              IntWritable v = new IntWritable(1);
    
              @Override
              protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                  // Each call receives one line of the input split (the key is the byte offset of that line)
                  String line = value.toString();
                  String[] words = line.split(" ");
                  // Emit (word, 1) for every word in the line
                  for (String word : words) {
                      k.set(word);
                      context.write(k, v);
                  }
              }
          }
    
          public static class WordCountReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
    
              LongWritable v = new LongWritable();
    
              @Override
              protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                  long sum = 0;
                  for (IntWritable count : values) {
                      sum += count.get();
                  }
                  v.set(sum);
                  context.write(key, v);
              }
          }
    
          public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    
              // Needed when running from a Windows/IDE client: submit to HDFS as this user
              System.setProperty("HADOOP_USER_NAME", "atguigu");
              Configuration conf = new Configuration();
              // fs.defaultFS is the current name of the deprecated fs.default.name property
              conf.set("fs.defaultFS", "hdfs://hadoop102:9000");
              Job job = Job.getInstance(conf);
    
              job.setJarByClass(WordCount.class);
    
              job.setMapperClass(WordCountMapper.class);
              job.setReducerClass(WordCountReducer.class);
    
              // The map output types (Text, IntWritable) differ from the final reduce output types,
              // so both pairs must be declared explicitly
              job.setMapOutputKeyClass(Text.class);
              job.setMapOutputValueClass(IntWritable.class);
              job.setOutputKeyClass(Text.class);
              job.setOutputValueClass(LongWritable.class);
    
              // Input and output paths are taken from the command-line arguments
              FileInputFormat.setInputPaths(job, new Path(args[0]));
              FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
              boolean result = job.waitForCompletion(true);
              System.exit(result ? 0 : 1);
          }
      }
  • POM file (the commands to package and run the job follow the listing)

      <dependencies>
          <dependency>
              <groupId>junit</groupId>
              <artifactId>junit</artifactId>
              <version>4.12</version>
          </dependency>
          <dependency>
              <groupId>org.apache.logging.log4j</groupId>
              <artifactId>log4j-core</artifactId>
              <version>2.8.2</version>
          </dependency>
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-common</artifactId>
              <version>2.7.2</version>
          </dependency>
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-client</artifactId>
              <version>2.7.2</version>
          </dependency>
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-hdfs</artifactId>
              <version>2.7.2</version>
          </dependency>
      </dependencies>
    
      <build>
          <plugins>
              <plugin>
                  <artifactId>maven-compiler-plugin</artifactId>
                  <version>2.3.2</version>
                  <configuration>
                      <source>1.8</source>
                      <target>1.8</target>
                  </configuration>
              </plugin>
              <plugin>
                  <artifactId>maven-assembly-plugin</artifactId>
                  <configuration>
                      <descriptorRefs>
                          <descriptorRef>jar-with-dependencies</descriptorRef>
                      </descriptorRefs>
                      <archive>
                          <manifest>
                              <mainClass>edu.hut.WordCount</mainClass>
                          </manifest>
                      </archive>
                  </configuration>
                  <executions>
                      <execution>
                          <id>make-assembly</id>
                          <phase>package</phase>
                          <goals>
                              <goal>single</goal>
                          </goals>
                      </execution>
                  </executions>
              </plugin>
          </plugins>
      </build>
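
To package and run the job, build the assembly jar and submit it to the cluster. A minimal sketch, assuming the Maven artifactId is wordcount with version 1.0-SNAPSHOT, that /wcinput already exists on HDFS, and that /wcoutput does not yet exist (adjust the jar name and paths to your own project):

    # The assembly plugin is bound to the package phase, so this produces the jar-with-dependencies
    mvn clean package

    # Submit the job; args[0] is the input directory, args[1] the output directory (must not exist)
    hadoop jar target/wordcount-1.0-SNAPSHOT-jar-with-dependencies.jar edu.hut.WordCount /wcinput /wcoutput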