javamapdb的简单介绍

admin 2022-12-07 21:21:07 439

今天给各位分享javamapdb的知识，其中也会对Hadoop的Partitioner、作业运行日志以及MultipleOutputs进行解释，如果能碰巧解决你现在面临的问题，别忘了关注本站，现在开始吧！

本文目录一览：

1、如何使用Hadoop的Partitioner
2、如何运行一个hadoop job
3、如何使用Hadoop的MultipleOutputs进行多文件输出

如何使用Hadoop的Partitioner

Hadoop里面的MapReduce编程模型，非常灵活，大部分环节我们都可以重写它的API，来灵活定制我们自己的一些特殊需求。

今天散仙要说的这个分区函数Partitioner，也是如此，下面我们先来看下Partitioner的作用：

对map端输出的数据key作一个散列，使数据能够均匀分布在各个reduce上进行后续操作，避免产生热点区。

Hadoop默认使用的分区函数是Hash Partitioner，源码如下：

/** Partition keys by their {@link Object#hashCode()}. */

public class HashPartitionerK, V extends PartitionerK, V {

/** Use {@link Object#hashCode()} to partition. */

public int getPartition(K key, V value,

int numReduceTasks) {

//默认使用key的hash值与上int的最大值，避免出现数据溢出的情况

return (key.hashCode() Integer.MAX_VALUE) % numReduceTasks;

}

大部分情况下，我们都会使用默认的分区函数，但有时我们又有一些特殊的需求，需要定制Partition来完成我们的业务，案例如下：

对如下数据，按字符串的长度分区：长度为1的放在一个分区，长度为2的放在一个分区，长度为3的放在一个分区。

河南省;1

河南;2

中国;3

中国人;4

大;1

小;3

中;11

这时候，我们使用默认的分区函数，就不行了，所以需要我们定制自己的Partition，首先分析下，我们需要3个分区输出，所以在设置reduce的个数时，一定要设置为3，其次在partition里，进行分区时，要根据长度具体分区，而不是根据字符串的hash码来分区。核心代码如下：

/**

* Partitioner

* */

public static class PPartition extends PartitionerText, Text{

@Override

public int getPartition(Text arg0, Text arg1, int arg2) {

/**

* 自定义分区，实现长度不同的字符串，分到不同的reduce里面

* 现在只有3个长度的字符串，所以可以把reduce的个数设置为3

* 有几个分区，就设置为几

* */

String key=arg0.toString();

if(key.length()==1){

return 1%arg2;

}else if(key.length()==2){

return 2%arg2;

}else if(key.length()==3){

return 3%arg2;

}

return 0;

}

全部代码如下：

package com.partition.test;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Partitioner;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;

import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.qin.operadb.PersonRecoder;

import com.qin.operadb.ReadMapDB;

/**

* @author qindongliang

* 大数据交流群：376932160

* **/

public class MyTestPartition {

/**

* map任务

* */

public static class PMapper extends MapperLongWritable, Text, Text, Text{

@Override

protected void map(LongWritable key, Text value,Context context)

throws IOException, InterruptedException {

// System.out.println("进map了");

//mos.write(namedOutput, key, value);

String ss[]=value.toString().split(";");

context.write(new Text(ss[0]), new Text(ss[1]));

}

/**

* Partitioner

* */

public static class PPartition extends PartitionerText, Text{

@Override

public int getPartition(Text arg0, Text arg1, int arg2) {

/**

* 自定义分区，实现长度不同的字符串，分到不同的reduce里面

* 现在只有3个长度的字符串，所以可以把reduce的个数设置为3

* 有几个分区，就设置为几

* */

String key=arg0.toString();

if(key.length()==1){

return 1%arg2;

}else if(key.length()==2){

return 2%arg2;

}else if(key.length()==3){

return 3%arg2;

}

return 0;

}

/***

* Reduce任务

* **/

public static class PReduce extends ReducerText, Text, Text, Text{

@Override

protected void reduce(Text arg0, IterableText arg1, Context arg2)

throws IOException, InterruptedException {

String key=arg0.toString().split(",")[0];

System.out.println("key== "+key);

for(Text t:arg1){

//System.out.println("Reduce: "+arg0.toString()+" "+t.toString());

arg2.write(arg0, t);

}

public static void main(String[] args) throws Exception{

JobConf conf=new JobConf(ReadMapDB.class);

//Configuration conf=new Configuration();

conf.set("mapred.job.tracker","192.168.75.130:9001");

//读取person中的数据字段

conf.setJar("tt.jar");

//注意这行代码放在最前面，进行初始化，否则会报

/**Job任务**/

Job job=new Job(conf, "testpartion");

job.setJarByClass(MyTestPartition.class);

System.out.println("模式： "+conf.get("mapred.job.tracker"));;

// job.setCombinerClass(PCombine.class);

job.setPartitionerClass(PPartition.class);

job.setNumReduceTasks(3);//设置为3

job.setMapperClass(PMapper.class);

// MultipleOutputs.addNamedOutput(job, "hebei", TextOutputFormat.class, Text.class, Text.class);

// MultipleOutputs.addNamedOutput(job, "henan", TextOutputFormat.class, Text.class, Text.class);

job.setReducerClass(PReduce.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(Text.class);

String path="hdfs://192.168.75.130:9000/root/outputdb";

FileSystem fs=FileSystem.get(conf);

Path p=new Path(path);

if(fs.exists(p)){

fs.delete(p, true);

System.out.println("输出路径存在，已删除！");

}

FileInputFormat.setInputPaths(job, "hdfs://192.168.75.130:9000/root/input");

FileOutputFormat.setOutputPath(job,p );

System.exit(job.waitForCompletion(true) ? 0 : 1);

}

如何运行一个hadoop job

用hadoop也算有一段时间了，一直没有注意过hadoop运行过程中，产生的数据日志，比如说System打印的日志，或者是log4j，slf4j等记录的日志，存放在哪里，日志信息的重要性，在这里散仙就不用多说了，调试任何程序基本上都得需要分析日志。

hadoop的日志主要是MapReduce程序，运行过程中，产生的一些数据日志，除了系统的日志外，还包含一些我们自己在测试时候，或者线上环境输出的日志，这部分日志通常会被放在userlogs这个文件夹下面，我们可以在mapred-site.xml里面配置运行日志的输出目录，散仙测试文件内容如下:

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <!-- jobtracker的master地址-->
  <property>
    <name>mapred.job.tracker</name>
    <value>192.168.75.130:9001</value>
  </property>

  <property>
    <!-- hadoop的日志输出指定目录-->
    <name>mapred.local.dir</name>
    <value>/root/hadoop1.2/mylogs</value>
  </property>

</configuration>

配置好，日志目录后，我们就可以把这个配置文件，分发到各个节点上，然后启动hadoop。

下面我们来看下在eclipse环境中如何调试，散仙在setup，map和reduce方法中，分别使用System打印了一些数据，当我们使用local方式跑MR程序时候，日志并不会被记录下来，而是直接会在控制台打印，散仙的测试代码如下：

package com.qin.testdistributed;

import java.io.File;

import java.io.FileReader;

import java.io.IOException;

import java.net.URI;

import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.filecache.DistributedCache;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.log4j.pattern.LogEvent;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import com.qin.operadb.WriteMapDB;

/**

* 测试hadoop的全局共享文件

* 使用DistributedCached

* 大数据技术交流群： 37693216

* @author qindongliang

* ***/

public class TestDistributed {

private static Logger logger=LoggerFactory.getLogger(TestDistributed.class);

private static class FileMapper extends MapperLongWritable, Text, Text, IntWritable{

Path path[]=null;

/**

* Map函数前调用

* */

@Override

protected void setup(Context context)

throws IOException, InterruptedException {

logger.info("开始启动setup了哈哈哈哈");

// System.out.println("运行了.........");

Configuration conf=context.getConfiguration();

path=DistributedCache.getLocalCacheFiles(conf);

System.out.println("获取的路径是： "+path[0].toString());

// FileSystem fs = FileSystem.get(conf);

FileSystem fsopen= FileSystem.getLocal(conf);

// FSDataInputStream in = fsopen.open(path[0]);

// System.out.println(in.readLine());

// for(Path tmpRefPath : path) {

// if(tmpRefPath.toString().indexOf("ref.png") != -1) {

// in = reffs.open(tmpRefPath);

// break;

// }

// FileReader reader=new FileReader("file://"+path[0].toString());

// File f=new File("file://"+path[0].toString());

// FSDataInputStream in=fs.open(new Path(path[0].toString()));

// Scanner scan=new Scanner(in);

// while(scan.hasNext()){

// System.out.println(Thread.currentThread().getName()+"扫描的内容: "+scan.next());

// }

// scan.close();

// System.out.println("size: "+path.length);

}

@Override

protected void map(LongWritable key, Text value,Context context)

throws IOException, InterruptedException {

// System.out.println("map aaa");

//logger.info("Map里的任务");

System.out.println("map里输出了");

// logger.info();

context.write(new Text(""), new IntWritable(0));

}

@Override

protected void cleanup(Context context)

throws IOException, InterruptedException {

logger.info("清空任务了。。。。。。");

}

private static class FileReduce extends ReducerObject, Object, Object, Object{

@Override

protected void reduce(Object arg0, IterableObject arg1,

Context arg2)throws IOException, InterruptedException {

System.out.println("我是reduce里面的东西");

}

public static void main(String[] args)throws Exception {

JobConf conf=new JobConf(TestDistributed.class);

//conf.set("mapred.local.dir", "/root/hadoop");

//Configuration conf=new Configuration();

// conf.set("mapred.job.tracker","192.168.75.130:9001");

//读取person中的数据字段

//conf.setJar("tt.jar");

//注意这行代码放在最前面，进行初始化，否则会报

String inputPath="hdfs://192.168.75.130:9000/root/input";

String outputPath="hdfs://192.168.75.130:9000/root/outputsort";

Job job=new Job(conf, "a");

DistributedCache.addCacheFile(new URI("hdfs://192.168.75.130:9000/root/input/f1.txt"), job.getConfiguration());

job.setJarByClass(TestDistributed.class);

System.out.println("运行模式： "+conf.get("mapred.job.tracker"));

/**设置输出表的的信息第一个参数是job任务，第二个参数是表名，第三个参数字段项**/

FileSystem fs=FileSystem.get(job.getConfiguration());

Path pout=new Path(outputPath);

if(fs.exists(pout)){

fs.delete(pout, true);

System.out.println("存在此路径, 已经删除......");

}

/**设置Map类**/

// job.setOutputKeyClass(Text.class);

//job.setOutputKeyClass(IntWritable.class);

job.setMapOutputKeyClass(Text.class);

job.setMapOutputValueClass(IntWritable.class);

job.setMapperClass(FileMapper.class);

job.setReducerClass(FileReduce.class);

FileInputFormat.setInputPaths(job, new Path(inputPath)); //输入路径

FileOutputFormat.setOutputPath(job, new Path(outputPath));//输出路径

System.exit(job.waitForCompletion(true) ? 0 : 1);

}

如何使用Hadoop的MultipleOutputs进行多文件输出

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Partitioner;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;

import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.qin.operadb.PersonRecoder;

import com.qin.operadb.ReadMapDB;

/***

* @author qindongliang

* 大数据技术交流群:324714439

* **/

public class TestMultiOutput {

/**

* map任务

* **/

public static class PMapper extends Mapperlt;LongWritable, Text, Text, Textgt;{

@Override

protected void map(LongWritable key, Text value,Context context)

throws IOException, InterruptedException {

String ss[]=value.toString().split(";");

context.write(new Text(ss[0]), new Text(ss[1]));

}

public static class PReduce extends Reducerlt;Text, Text, Text, Textgt;{

/**

* 设置多个文件输出

* */

private MultipleOutputs mos;

@Override

protected void setup(Context context)

throws IOException, InterruptedException {

mos=new MultipleOutputs(context);//初始化mos

}

@Override

protected void reduce(Text arg0, Iterablelt;Textgt; arg1, Context arg2)

throws IOException, InterruptedException {

String key=arg0.toString();

for(Text t:arg1){

if(key.equals("中国")){

/**

* 一个参数

* **/

mos.write("china", arg0,t);

} else if(key.equals("美国")){

mos.write("USA", arg0,t);

} else if(key.equals("中国人")){

mos.write("cperson", arg0,t);

}

//System.out.println("Reduce: "+arg0.toString()+" "+t.toString());

}

@Override

protected void cleanup(

Context context)

throws IOException, InterruptedException {

mos.close();//释放资源

}

public static void main(String[] args) throws Exception{

JobConf conf=new JobConf(ReadMapDB.class);

//Configuration conf=new Configuration();

// conf.set("mapred.job.tracker","192.168.75.130:9001");

//读取person中的数据字段

// conf.setJar("tt.jar");

//注意这行代码放在最前面，进行初始化，否则会报

/**Job任务**/

Job job=new Job(conf, "testpartion");

job.setJarByClass(TestMultiOutput.class);

System.out.println("模式： "+conf.get("mapred.job.tracker"));;

// job.setCombinerClass(PCombine.class);

//job.setPartitionerClass(PPartition.class);

//job.setNumReduceTasks(5);

job.setMapperClass(PMapper.class);

/**

* 注意在初始化时需要设置输出文件的名

* 另外名称，不支持中文名，仅支持英文字符

* **/

MultipleOutputs.addNamedOutput(job, "china", TextOutputFormat.class, Text.class, Text.class);

MultipleOutputs.addNamedOutput(job, "USA", TextOutputFormat.class, Text.class, Text.class);

MultipleOutputs.addNamedOutput(job, "cperson", TextOutputFormat.class, Text.class, Text.class);

job.setReducerClass(PReduce.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(Text.class);

String path="hdfs://192.168.75.130:9000/root/outputdb";

FileSystem fs=FileSystem.get(conf);

Path p=new Path(path);

if(fs.exists(p)){

fs.delete(p, true);

System.out.println("输出路径存在，已删除！");

}

FileInputFormat.setInputPaths(job, "hdfs://192.168.75.130:9000/root/input");

FileOutputFormat.setOutputPath(job,p );

System.exit(job.waitForCompletion(true) ? 0 : 1);

}

关于javamapdb的介绍到此就结束了，不知道你从中找到你需要的信息了吗？如果你还想了解更多这方面的信息，记得收藏关注本站。

标签：javamapdb