BIG_DATA Musings (2) - Exploring Word Frequencies in Romance of the Three Kingdoms with Hadoop (Revised)

A Collision with Traditional Culture

  • Although the times change by the day, the old stories still hold people's affection.
  • This time we use Hadoop to explore the much-loved Romance of the Three Kingdoms from a different angle.

Principle

  • Two MapReduce jobs are chained together: the first segments the Chinese text with Lucene's SmartChineseAnalyzer and counts word frequencies, writing its output as a SequenceFile to a temporary directory; the second reads that directory, uses InverseMapper to swap each (word, count) pair into (count, word), and sorts by count in descending order with a custom comparator.

Difficulty

  • Compiling and running a Hadoop program that depends on third-party jars: the Lucene jars must be on the javac classpath at compile time and shipped to the cluster at run time with -libjars.

Preparation

  • Start HDFS and YARN

    cd ~/hello/bigdata/hadoop-2.7.1 # the Hadoop installation directory
    sbin/start-dfs.sh
    sbin/start-yarn.sh
    jps
  • Set up the environment for running Hadoop

    export JAVA_HOME="/usr/lib/jvm/java-7-openjdk-i386"
    export PATH=${JAVA_HOME}/bin:${PATH}
    export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar
  • Write WordCountOrder to perform the word-frequency count (a quick local tokenizer check follows the listing)

    • cd ~/hello/bigdata/hadoop_ex/wordcount
      WordCountOrder.java

      import java.io.IOException;
      import java.util.Random;
      import java.util.StringTokenizer;

      import org.apache.hadoop.conf.Configuration;

      import org.apache.hadoop.fs.FileSystem;
      import org.apache.hadoop.fs.Path;

      import org.apache.hadoop.io.IntWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.io.WritableComparable;

      import org.apache.hadoop.mapreduce.Job;
      import org.apache.hadoop.mapreduce.Mapper;
      import org.apache.hadoop.mapreduce.Reducer;
      import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
      import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
      import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
      import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
      import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

      import org.apache.hadoop.util.GenericOptionsParser;

      import java.util.Iterator;

      import org.apache.lucene.analysis.TokenStream;
      import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
      import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
      import org.apache.lucene.analysis.util.CharArraySet;
      import org.apache.lucene.util.Version;

      public class WordCountOrder {

          public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

              private final static IntWritable one = new IntWritable(1);
              private Text word = new Text();

              public void map(Object key, Text value, Context context)
                      throws IOException, InterruptedException {

                  /*
                  // Optional: merge a custom stop-word list with the analyzer's default one.
                  String[] self_stop_words = { "的", "在", "了", "呢", "是" };
                  CharArraySet cas = new CharArraySet(0, true);
                  for (int i = 0; i < self_stop_words.length; i++) {
                      cas.add(self_stop_words[i]);
                  }
                  // Add the system default stop words.
                  Iterator<Object> itor = SmartChineseAnalyzer.getDefaultStopSet().iterator();
                  while (itor.hasNext()) {
                      cas.add(itor.next());
                  }
                  */

                  // Analyzer for mixed Chinese/English text.
                  // (Note: a new analyzer is created for every input record.)
                  SmartChineseAnalyzer sca = new SmartChineseAnalyzer();
                  //SmartChineseAnalyzer sca = new SmartChineseAnalyzer(cas);

                  TokenStream ts = sca.tokenStream("field", value.toString());
                  CharTermAttribute ch = ts.addAttribute(CharTermAttribute.class);

                  ts.reset();
                  while (ts.incrementToken()) {
                      word.set(ch.toString());
                      context.write(word, one);
                  }
                  ts.end();
                  ts.close();
              }
          }

          public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

              private IntWritable result = new IntWritable();

              public void reduce(Text key, Iterable<IntWritable> values, Context context)
                      throws IOException, InterruptedException {
                  int sum = 0;
                  for (IntWritable val : values) {
                      sum += val.get();
                  }
                  result.set(sum);
                  context.write(key, result);
              }
          }

          // Reverses the default ascending order so the sort job emits the
          // highest counts first.
          private static class IntWritableDecreasingComparator extends IntWritable.Comparator {
              public int compare(WritableComparable a, WritableComparable b) {
                  return -super.compare(a, b);
              }

              public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
                  return -super.compare(b1, s1, l1, b2, s2, l2);
              }
          }

          public static void main(String[] args) throws Exception {

              Configuration conf = new Configuration();
              String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
              if (otherArgs.length != 2) {
                  System.err.println("Usage: wordcount <in> <out>");
                  System.exit(2);
              }
              // A temporary directory for the intermediate (unsorted) word counts.
              Path tempDir = new Path("wordcount-temp-"
                      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

              Job job = Job.getInstance(conf, "word count");
              job.setJarByClass(WordCountOrder.class);
              int exitCode = 1;
              try {
                  job.setMapperClass(TokenizerMapper.class);
                  job.setCombinerClass(IntSumReducer.class);
                  job.setReducerClass(IntSumReducer.class);

                  job.setOutputKeyClass(Text.class);
                  job.setOutputValueClass(IntWritable.class);

                  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
                  FileOutputFormat.setOutputPath(job, tempDir);
                  // The word-count output goes to the temporary directory first;
                  // the sort job then uses that directory as its input.
                  job.setOutputFormatClass(SequenceFileOutputFormat.class);

                  if (job.waitForCompletion(true)) { // once the word count has finished

                      Job sortJob = Job.getInstance(conf, "sort");
                      sortJob.setJarByClass(WordCountOrder.class);

                      FileInputFormat.addInputPath(sortJob, tempDir);
                      sortJob.setInputFormatClass(SequenceFileInputFormat.class);

                      // InverseMapper swaps the key and value of each (word, count) pair.
                      sortJob.setMapperClass(InverseMapper.class);
                      // A single reducer, so there is exactly one output file.
                      sortJob.setNumReduceTasks(1);
                      FileOutputFormat.setOutputPath(sortJob, new Path(otherArgs[1]));

                      sortJob.setOutputKeyClass(IntWritable.class);
                      sortJob.setOutputValueClass(Text.class);
                      /* Hadoop sorts IntWritable keys in ascending order by default,
                       * but descending order is wanted here, so the custom
                       * IntWritableDecreasingComparator is set as the comparator
                       * for the sort job's keys (the word counts). */
                      sortJob.setSortComparatorClass(IntWritableDecreasingComparator.class);

                      exitCode = sortJob.waitForCompletion(true) ? 0 : 1;
                  }
              } finally {
                  // Mark the temporary directory for deletion when the FileSystem
                  // shuts down. (Calling System.exit() inside the try block would
                  // skip this finally clause, which is why the exit code is saved
                  // and System.exit() is called after it.)
                  FileSystem.get(conf).deleteOnExit(tempDir);
              }
              System.exit(exitCode);
          }
      }
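  • Optionally, sanity-check the tokenizer locally before submitting the job. The following is a minimal sketch that uses only the Lucene classes already imported above; the class name TokenizeCheck and the sample sentence are arbitrary, and it runs with the same three Lucene jars on the classpath.

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenizeCheck {
        public static void main(String[] args) throws IOException {
            // The same analyzer configuration as in TokenizerMapper.
            SmartChineseAnalyzer sca = new SmartChineseAnalyzer();
            TokenStream ts = sca.tokenStream("field", "却说曹操引兵望江陵而来");
            CharTermAttribute ch = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // One token per line; these are the keys the mapper would emit.
                System.out.println(ch.toString());
            }
            ts.end();
            ts.close();
            sca.close();
        }
    }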
  • Compile the Java code and package it into a jar

    hadoop="../../hadoop-2.7.1/bin/hadoop"
    javac -cp `$hadoop classpath`':lucene-core-4.10.1.jar:lucene-analyzers-common-4.10.1.jar:lucene-analyzers-smartcn-4.10.1.jar' WordCountOrder.java
    jar cf wcr.jar WordCountOrder*.class
  • (screenshot: hadoop-001)

  • Inspect the contents of HDFS

  • ../../hadoop-2.7.1/bin/hdfs dfs -ls -R /

  • (screenshot: hadoop-002)

  • Run the Hadoop program (a note on -libjars follows the commands)

    export LIBJARS=lucene-core-4.10.1.jar,lucene-analyzers-common-4.10.1.jar,lucene-analyzers-smartcn-4.10.1.jar
    ../../hadoop-2.7.1/bin/hadoop jar wcr.jar WordCountOrder -libjars ${LIBJARS} /hadoop/test3 /hadoop/out
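  • A note on -libjars: it is honoured above because the driver hands its arguments to GenericOptionsParser. An equivalent, commonly used alternative is to implement Hadoop's Tool interface and launch the driver through ToolRunner. The sketch below only illustrates that pattern; the class name WordCountOrderTool is hypothetical, and run() would contain the two-job setup shown earlier.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class WordCountOrderTool extends Configured implements Tool {

        @Override
        public int run(String[] args) throws Exception {
            // getConf() already reflects -libjars, -D and the other generic
            // options; the word-count and sort jobs would be built here.
            Configuration conf = getConf();
            return 0;
        }

        public static void main(String[] args) throws Exception {
            // ToolRunner strips the generic options before calling run(),
            // so args contains only <in> and <out>.
            System.exit(ToolRunner.run(new Configuration(), new WordCountOrderTool(), args));
        }
    }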
  • The web UI while the job is running

  • (screenshot: hadoop-003)

  • Inspect the word-frequency results, filtering by word length with grep

  • ../../hadoop-2.7.1/bin/hdfs dfs -cat /hadoop/out/part-r-00000 > out.txt

    • cat out.txt | grep -E '[[:digit:]]{1,}[[:blank:]].{1}$' | head   # one-character words

      8503 曰
      7077 之
      3880 不
      3730 兵
      3495 人
      3087 一
      2590 有
      2516 军
      2389 大
      2343 于
    • cat out.txt | grep -E '[[:digit:]]{1,}[[:blank:]].{2}$' | head   # two-character words

      880 曹操
      837 将军
      541 司马
      512 丞相
      494 关公
      420 不可
      406 荆州
      372 夏侯
      367 如此
      321 主公
    • cat out.txt | grep -E '[[:digit:]]{1,}[[:blank:]].{3}$' | head   # three-character words

      146 诸葛亮
      48 大将军
      34 刀斧手
      30 中郎将
      29 阳平关
      28 不得已
      24 大丈夫
      23 不可不
      18 弓弩手
      15 东南风
    • cat out.txt | grep -E '[[:digit:]]{1,}[[:blank:]].{4,}$' | head   # words of four or more characters

      24 决一死战
      22 措手不及
      18 不计其数
      16 深沟高垒
      15 按兵不动
      14 所到之处
      13 勃然大怒
      13 出其不意
      12 人困马乏
      11 将计就计

Closing Remarks

  • From just a handful of words, the main characters, places, and stratagems of Romance of the Three Kingdoms are already easy to pick out.