无任何干货,仅供复制
程序说明:
1. 分析一个应用的访问日志文件,找出每个用户ID的访问次数。日志格式基本上是:"2012-10-26 14:41:30,748 userNameId-777 from IP-10.232.25.144 invoked URL-http://xxx/hello.jsonp"
2. Standalone模式,但直接用maven项目所依赖的hadoop库,你不必再另装hadoop
<!-- pom.xml --> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-core</artifactId> <version>1.0.4</version> </dependency>
//Mapper public class Coupon11LogMapper extends Mapper<LongWritable, Text, Text, LongWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws java.io.IOException, InterruptedException { String line = value.toString(); String accessRegex = ".*userNameId\\-(\\d+).*"; Pattern pattern = Pattern.compile(accessRegex); Matcher matcher = pattern.matcher(line); if (!matcher.find()) { return; } String userNameId = matcher.group(1); context.write(new Text(userNameId), new LongWritable(1l)); }; }
//Reducer public class Coupon11LogReducer extends Reducer<Text, LongWritable, Text, LongWritable> { @Override protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { Long sum = 0l; for (LongWritable value : values) { sum = sum + value.get(); } context.write(key, new LongWritable(sum)); } }
//Job Runner public class Coupon11LogJobMain { public static void main(String[] args) throws Exception { String inputFile = "/home/kent/dev/hadoop/bigdata/coupon11/coupon11.log"; String outDir = "/home/kent/dev/hadoop/bigdata/coupon11/output" + System.currentTimeMillis(); Job job = new Job(); job.setJarByClass(Coupon11LogJobMain.class); FileInputFormat.addInputPaths(job, inputFile); FileOutputFormat.setOutputPath(job, new Path(outDir)); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(Coupon11LogMapper.class); job.setReducerClass(Coupon11LogReducer.class); System.exit(job.waitForCompletion(true) ? 0 : 1); } }