


Talk is cheap. Show me the code. -- Linus Torvalds







import collections,re
import sys
def cal(filename = 'in.txt'):
	"""Count how often each word occurs in a text file.

	The text is lower-cased, and every non-word character and every digit
	is treated as a separator (underscores count as word characters).

	:param filename: path of the file to read; defaults to 'in.txt'.
	:return: a ``collections.defaultdict`` mapping each word to its count.
	:raises IOError: if the file cannot be opened or read.
	"""
	print('now processing:' + filename + '......')
	# The context manager closes the file even if read() fails.
	with open(filename, 'r') as f:
		data = f.read()
	data = re.sub(r'[\W\d]', ' ', data)
	data = data.lower()
	dic = collections.defaultdict(int)
	# split() without an argument collapses runs of blanks and never yields
	# an empty token, so there is no '' entry to delete afterwards (deleting
	# it unconditionally raised KeyError for inputs such as "a b").
	for item in data.split():
		dic[item] += 1
	return dic


if __name__ == '__main__':
	# Script entry point: print the word counts sorted by word.
	try:
		print(sorted(cal().items()))
	except IOError:
		print('no input file')
// This week's exercise is solved in a MapReduce style: every file is processed
// by its own thread, and the per-file results are handed to a reduce step that
// merges them.
// Techniques used: Callable (a less commonly used way of running threaded
// work), regular expressions, sorting the entries of a HashMap, and the
// MapReduce idea. A Callable can hand back the result of the work done on a
// thread and can also throw exceptions.
// The code follows.

// ---------------------------------------------------------------------------
// WorldCountMap.java
// ---------------------------------------------------------------------------
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.concurrent.Callable;

/**
 * Map step: counts the words of a single file.
 *
 * @author Matrix42
 */
public class WorldCountMap implements Callable<HashMap<String, Integer>> {

    /** The file whose words are counted. */
    private final File file;

    /**
     * @param file the file to process
     */
    public WorldCountMap(File file) {
        this.file = file;
    }

    /**
     * Reads the whole file into one string.
     *
     * <p>A newline is appended after every line so that the last word of one
     * line and the first word of the next are not fused into a single token.
     *
     * @param file the file to read
     * @return the file content
     * @throws IOException if the file is missing or cannot be read
     */
    private String readText(File file) throws IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader on every path.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Splits the file content on non-word characters and digits and counts
     * the lower-cased tokens.
     *
     * @return a fresh map from word to number of occurrences; calling this
     *         method again recounts from scratch instead of adding to the
     *         previous result
     * @throws IllegalStateException if the file cannot be read (the
     *         underlying {@link IOException} is kept as the cause)
     */
    @Override
    public HashMap<String, Integer> call() {
        String text;
        try {
            text = readText(file);
        } catch (IOException e) {
            // call() is declared without checked exceptions, so report the
            // failure unchecked; submitted through an executor it surfaces
            // as an ExecutionException from Future.get().
            throw new IllegalStateException("Cannot read " + file, e);
        }

        HashMap<String, Integer> result = new HashMap<String, Integer>();
        // Split with a regular expression.
        for (String token : text.split("[^\\w]|[\\d]")) {
            // Adjacent separators produce empty tokens; skip them.
            if (token.equals("")) {
                continue;
            }
            // All words are converted to lower case.
            String word = token.toLowerCase();
            Integer seen = result.get(word);
            result.put(word, seen == null ? 1 : seen + 1);
        }
        return result;
    }
}

// ---------------------------------------------------------------------------
// WorldCountReduce.java
// ---------------------------------------------------------------------------
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

/**
 * Reduce step: merges the per-file word counts produced by the map step.
 *
 * @author Matrix42
 */
public class WorldCountReduce {

    /** The single shared instance, created when the class is initialized. */
    private static final WorldCountReduce INSTANCE = new WorldCountReduce();

    /** Results delivered by the map step. */
    private final List<Map<String, Integer>> resultList;

    /** Merged result of the reduce step. */
    private final HashMap<String, Integer> result;

    /** Private constructor: instances are only handed out by {@link #getInstance()}. */
    private WorldCountReduce() {
        this.resultList = new ArrayList<Map<String, Integer>>();
        this.result = new HashMap<String, Integer>();
    }

    /**
     * @return the singleton instance
     */
    public static WorldCountReduce getInstance() {
        return INSTANCE;
    }

    /**
     * Registers one map result to be merged by {@link #calculate()}.
     *
     * @param res word counts of one file
     */
    public void add(Map<String, Integer> res) {
        resultList.add(res);
    }

    /**
     * Merges all registered map results into the reduce result.
     *
     * <p>The merged map is rebuilt on every call, so calling this method
     * repeatedly does not count the same input twice.
     */
    public void calculate() {
        result.clear();
        for (Map<String, Integer> map : resultList) {
            for (Entry<String, Integer> entry : map.entrySet()) {
                String key = entry.getKey();
                Integer seen = result.get(key);
                result.put(key, seen == null ? entry.getValue() : seen + entry.getValue());
            }
        }
    }

    /**
     * @return the merged word counts computed by the last {@link #calculate()}
     */
    public HashMap<String, Integer> getResult() {
        return result;
    }
}

// ---------------------------------------------------------------------------
// TaskManager.java
// ---------------------------------------------------------------------------
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

/**
 * Task manager: callers use this class and never deal with WorldCountMap or
 * WorldCountReduce directly.
 *
 * @author Matrix42
 */
public class TaskManager {

    private ArrayList<File> tasks;

    private List<Future<HashMap<String, Integer>>> taskList =
            new ArrayList<Future<HashMap<String, Integer>>>();

    /** Number of entries of {@link #taskList} already handed to the reduce step. */
    private int collected;

    private WorldCountReduce reduce;

    private ExecutorService exec;

    /**
     * @param tasks the files to process
     */
    public TaskManager(ArrayList<File> tasks) {
        this.tasks = tasks;
    }

    /**
     * Adds one more file to process.
     *
     * @param file the file to add
     */
    public void addTask(File file) {
        tasks.add(file);
    }

    /**
     * Starts the work: one map task per file, run on a thread pool.
     */
    public void start() {
        // newFixedThreadPool rejects a size of 0, so an empty task list
        // still gets a one-thread pool.
        exec = Executors.newFixedThreadPool(Math.max(1, tasks.size()));
        for (File task : tasks) {
            taskList.add(exec.submit(new WorldCountMap(task)));
        }
    }

    /**
     * Stops the thread pool; does nothing if {@link #start()} was never called.
     */
    public void shutdownTask() {
        if (exec != null) {
            exec.shutdownNow();
        }
    }

    /**
     * Hands the map results to the reduce step and returns the merged counts.
     *
     * <p>Each map result is handed over only once, so calling this method
     * again does not double the counts. A file whose map task failed is
     * reported on standard error and left out of the result.
     *
     * @return the merged word counts
     */
    public HashMap<String, Integer> getResult() {
        reduce = WorldCountReduce.getInstance();
        while (collected < taskList.size()) {
            Future<HashMap<String, Integer>> future = taskList.get(collected);
            try {
                reduce.add(future.get());
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and stop waiting;
                // the remaining futures are picked up by a later call.
                Thread.currentThread().interrupt();
                e.printStackTrace();
                break;
            } catch (ExecutionException e) {
                e.printStackTrace();
            }
            collected++;
        }
        reduce.calculate();
        return reduce.getResult();
    }
}

// ---------------------------------------------------------------------------
// Client.java
// ---------------------------------------------------------------------------
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

public class Client {

    /**
     * Counts the words of the given files and prints them by descending count.
     *
     * @param args paths of the files to process; when empty, the original
     *             sample files D:/a.txt, D:/b.txt and D:/c.txt are used
     */
    public static void main(String[] args) {
        // Add the files to process.
        ArrayList<File> tasks = new ArrayList<>();
        if (args.length > 0) {
            for (String path : args) {
                tasks.add(new File(path));
            }
        } else {
            tasks.add(new File("D:/a.txt"));
            tasks.add(new File("D:/b.txt"));
            tasks.add(new File("D:/c.txt"));
        }

        TaskManager manager = new TaskManager(tasks);
        // Start the work.
        manager.start();
        try {
            // Fetch the result.
            Map<String, Integer> resMap = manager.getResult();
            List<Entry<String, Integer>> list =
                    new ArrayList<Entry<String, Integer>>(resMap.entrySet());

            // Sort by value, descending.
            Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                    return o2.getValue().compareTo(o1.getValue());
                }
            });

            // Print.
            for (Entry<String, Integer> e : list) {
                System.out.println(e.getKey() + ":" + e.getValue());
            }
        } finally {
            // Always stop the pool; its threads would otherwise keep the JVM alive.
            manager.shutdownTask();
        }
    }
}

// The regular expression is a bit shaky: RegexBuddy and Java give different
// results.
// Result:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Date    : 2015-03-07 12:41:14
# @Author  : NSSimacer
# @Email   : wuxiaoqiang1020@gmail.com
# @Version : 1.0

import re


def count_words(file_name):
    '''
    Count the words of a plain English text file.

    A word is a maximal run of ASCII letters and digits; matching is
    case-sensitive ("Hello" and "hello" are different words).

    :param file_name: path of the text file to read.
    :return: a tuple ``(line count, word count, list of all words in file
             order, dict mapping each word to its number of occurrences)``.
    '''
    words = []
    words_dict = {}
    lines_count = 0
    with open(file_name, 'r') as f:
        for line in f:
            lines_count += 1
            # Drop everything that is not part of an English word.
            words.extend(re.findall(r'[a-zA-Z0-9]+', line.strip()))

    for word in words:
        words_dict[word] = words_dict.get(word, 0) + 1

    return lines_count, len(words), words, words_dict


if __name__ == '__main__':
    file_name = 'plain_text.txt'
    result = count_words(file_name)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('In file: %s' % (file_name,))
    print('Total Lines: %s' % (result[0],))
    print('Total Words: %s' % (result[1],))
    print('Words: %s' % (', '.join(result[2]),))
    print('Frequency of each word:')
    for key, value in result[3].items():
        print('%s: %s' % (key, value))
# Print every distinct whitespace-separated word of `filename` with its count.
# The loop must index the i-th field ($i); counting $1 tallied only the first
# word of each line, NF times. The default field splitting collapses runs of
# blanks and tabs, so no empty "words" are counted.
awk '{ for (i = 1; i <= NF; ++i) count[$i]++ } END { for (w in count) print w, count[w] }' filename
public static void main(String[] args) throws IOException { // TODO 自动生成的方法存根 Scanner cin =new Scanner(System.in); System.out.println("请输入英文文件目录!"); String filename = cin.nextLine(); if(filename == null) { System.out.println("该文件不存在,请重新输入!"); return ; } BufferedReader in = new BufferedReader(new FileReader(filename)); String s; StringBuilder sb = new StringBuilder(); while((s = in.readLine()) !=null ){ sb.append(s); } in.close(); Pattern pattern; String[] arr = sb.toString().split(" "); Map<String,Integer> map = new HashMap<String,Integer>(); for(int i = 0; i < arr.length; i++){ if(arr[i].matches("^\\w+$")){ if(map.containsKey(arr[i])) map.replace(arr[i], map.get(arr[i])+1); else map.put(arr[i], 1); } } for(String key:map.keySet()) System.out.println(key +": "+ map.get(key)); }
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

/**
 * Utility for counting how often a string occurs in a text file.
 */
public final class StringCount {

    // All methods of a utility class are static, so the constructor is
    // private and instantiation is refused.
    private StringCount() {
        throw new AssertionError();
    }

    public static void main(String[] args) {
        StringCount.countWordInFile("G:\\a.txt", "b");
    }

    /**
     * Counts the non-overlapping occurrences of a string in a file and prints
     * the count to standard output.
     *
     * <p>Occurrences are searched line by line, so a match never spans a line
     * break. If the file cannot be opened or read, the stack trace is printed
     * and the occurrences counted so far (0 for a missing file) are returned.
     *
     * @param fileName path and name of the file
     * @param word the string to look for; an empty string has no occurrences
     * @return the number of occurrences
     * @throws NullPointerException if {@code word} is {@code null}
     */
    public static int countWordInFile(String fileName, String word) {
        int counter = 0;
        // An empty search string matches at every position without ever
        // advancing, which used to loop forever; define it as "no matches".
        if (word.isEmpty()) {
            System.out.println(counter);
            return counter;
        }
        // try-with-resources closes the reader on every path and cannot trip
        // over a reader that was never created.
        try (BufferedReader br = new BufferedReader(new FileReader(new File(fileName)))) {
            String line;
            while ((line = br.readLine()) != null) {
                int from = 0;
                int index;
                // Continue searching right after the previous match instead
                // of copying the remainder of the line each time.
                while ((index = line.indexOf(word, from)) >= 0) {
                    counter++;
                    from = index + word.length();
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(counter);
        return counter;
    }
}
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from collections import defaultdict
import re


def replace(s):
    """Substitution callback for ``re.sub`` in :func:`cal`.

    :param s: the match object; group 1 is set when the match is ``n't``.
    :return: the match itself for ``n't`` (so contractions stay intact),
             otherwise a single blank.
    """
    if s.group(1) == 'n\'t':
        return s.group(1)
    return ' '


def cal(filename='in.txt'):
    """Count how often each word occurs in a text file.

    The text is lower-cased; every non-word character and digit becomes a
    separator, except the apostrophe inside ``n't``, so ``can't`` is counted
    as one word.

    :param filename: path of the file to read; defaults to 'in.txt'.
    :return: a ``defaultdict`` mapping each word to its count.
    :raises IOError: if the file cannot be opened or read.
    """
    print('now processing:' + filename + '......')
    dic = defaultdict(lambda: 0)
    with open(filename, 'r') as f:
        data = f.read()
    # Convert everything to lower case.
    data = data.lower()
    # Replace every non-word and digit character, except in n't contractions.
    data = re.sub(r'(n[\']t)|([\W\d])', replace, data)
    # split() without an argument never yields an empty token, so there is no
    # '' entry to delete (deleting it unconditionally raised KeyError whenever
    # the text had no leading or trailing blank).
    for item in data.split():
        dic[item] += 1
    return dic


if __name__ == '__main__':
    try:
        dic = cal()
    except IOError:
        # Only a failure to open or read the file means "no input file".
        print('no input file')
    else:
        for key, val in dic.items():
            print(key, '--', val)

# A detailed walkthrough is written up in the blog post on English word
# frequency counting. This version mainly adapts the code provided by Nowcoder:
# a contraction such as "can't" should count as one word, so it is kept intact.
import re
from collections import Counter


def findnum(filename):
    """Write the frequency of every word of a file to ``frequency.txt``.

    A word is a maximal run of the letters a-z after lower-casing the text.
    The sorted list of distinct words is printed, and one line
    ``word : count`` per word is written, in alphabetical order, to
    ``frequency.txt`` in the current working directory.

    :param filename: path of the text file to read.
    """
    with open(filename, 'r') as fin, open('frequency.txt', 'w') as fout:
        text = fin.read()
        word = re.findall("[a-z]+", text.lower())
        # Count in a single pass; calling word.count() once per distinct word
        # rescanned the whole list every time.
        counts = Counter(word)
        word_set = sorted(counts)
        print(word_set)
        for i in word_set:
            fout.write(i + " : " + str(counts[i]) + "\n")


if __name__ == '__main__':
    findnum('a.txt')
# -*- coding: utf-8 -*-
import collections
import os
import re


def countword(file):
    """Count how often each word occurs in a text file.

    The text is lower-cased, and every non-word character and every digit
    is treated as a separator.

    :param file: path of the file to read.
    :return: a ``collections.defaultdict`` mapping each word to its count,
             or ``None`` (after printing 'file open error!') when the file
             cannot be opened or read.
    """
    try:
        # The context manager closes the file on every path.
        with open(file, 'r') as f:
            data = f.read()
    except IOError:
        print('file open error!')
        return None

    data = data.lower()
    data = re.sub(r'[\W\d]', ' ', data)
    dic = collections.defaultdict(int)
    # One increment per token is the whole count; the former membership test
    # against the token list was always false and scanned the list per word.
    for item in data.split():
        dic[item] += 1
    return dic


if __name__ == '__main__':
    file = "words.txt"
    res = countword(file)
    if res is None:
        print('run time error')
    else:
        # Ascending by count; single-argument print works on Python 2 and 3.
        for item in sorted(res.items(), key=lambda x: x[1]):
            print(item[0] + ":%d" % item[1])
/**
 * Logs every distinct word of `text`, in order of first appearance, each
 * followed by a line of dashes and the number of times the word occurs.
 *
 * Characters with a char code below 65 (punctuation, digits, control
 * characters) are removed, except the space, which separates words.
 * Comparison is case-sensitive.
 *
 * @param {string} text the text to analyse
 */
function main(text) {
    // Strip punctuation. The characters to keep are collected into a new
    // array; splicing the array while walking it forward skipped the
    // character following every removal.
    var kept = [];
    for (var i = 0; i < text.length; i++) {
        var code = text.charCodeAt(i);
        if (code >= 65 || code === 32) {
            kept.push(text.charAt(i));
        }
    }

    // Tally each word once, remembering the order of first appearance.
    var words = kept.join('').split(' ');
    var counts = Object.create(null);
    var order = [];
    for (var j = 0; j < words.length; j++) {
        var word = words[j];
        if (word === '') {
            // Produced by consecutive spaces; not a word.
            continue;
        }
        if (!(word in counts)) {
            counts[word] = 0;
            order.push(word);
        }
        counts[word]++;
    }

    for (var k = 0; k < order.length; k++) {
        console.log(order[k]);
        console.log('-------------------' + counts[order[k]]);
    }
}

//var text = 'She is not afraid of changeWhile most people rather continue on living unfulfilled lives as long as their comfort zone remains intact, the empowered woman is all about embracing change. She understands growth cannot happen without change. She understands that change is the gift life offers you to choose your destiny. Therefore, she is not afraid of change because it is her stepping stone towards success. ';

// Run only when a global `text` has been provided; with the sample above
// commented out, an unconditional call throws a ReferenceError.
if (typeof text !== 'undefined') {
    main(text);
}
package BeautifulCoding;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Prints the total number of words of a text file and the frequency of each
 * word.
 */
public class Solution8 {

    /** A word: a run of word characters that are neither digits nor whitespace. */
    private static final Pattern WORD = Pattern.compile("[\\w&&[^\\d]&&[^\\s]]+");

    /**
     * Reads {@code files/1.txt} below the current working directory and prints
     * its word statistics.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String path = new File("").getAbsolutePath() + "//files//" + "1.txt";
        StringBuilder builder = new StringBuilder();
        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String temp;
            while ((temp = reader.readLine()) != null) {
                // Keep a separator between lines; without it the last word of
                // a line is fused with the first word of the next one.
                builder.append(temp).append('\n');
            }
        }
        printwordsnum(builder.toString());
    }

    /**
     * Prints the total word count of {@code str} followed by one line per
     * distinct word with its number of occurrences. Matching is
     * case-sensitive; the order of the per-word lines is unspecified.
     *
     * @param str the text to analyse
     */
    public static void printwordsnum(String str) {
        int wordsum = 0;
        Map<String, Integer> total = new HashMap<>();
        Matcher m = WORD.matcher(str);
        while (m.find()) {
            wordsum++;
            String key = m.group();
            Integer old = total.get(key);
            total.put(key, old == null ? 1 : old + 1);
        }
        System.out.println("该文本单词总数是:" + wordsum);
        System.out.println("其中各单词出现频率详细:");
        for (Map.Entry<String, Integer> word : total.entrySet()) {
            System.out.println(word.getKey() + "出现了:" + word.getValue() + "次");
        }
    }
}
