Add test classes (BayesTest, HanLPTest)

master
ziyonghong 5 years ago
parent e30851d47b
commit 67b32067de

@ -0,0 +1,109 @@
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.junit.Test;
public class BayesTest {

    /**
     * Demonstrates Spark MLlib's naive Bayes classifier on a tiny hand-built
     * gender data set: trains on three labeled feature vectors, then predicts
     * the class of one test vector and prints the label and the class
     * probabilities.
     *
     * <p>Runs Spark in local mode ({@code local[*]}), so no cluster is needed.
     */
    @Test
    public void TestA() {
        // Local-mode Spark using all available cores.
        SparkConf conf = new SparkConf().setAppName("NaiveBayesTest").setMaster("local[*]");
        // try-with-resources closes the context even if training/prediction
        // throws; the original only closed it on the success path (leak on failure).
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            /*
             * MLlib supports two local vector types: DenseVector (every slot
             * given explicitly) and SparseVector (only listed indices set;
             * unspecified slots default to 0.0).
             *
             * Feature layout used here (6 binary features):
             *   male   == (1.0, 0.0, 1.0, 0.0, 1.0, 0.0)
             *   female == (1.0, 1.0, 1.0, 1.0, 0.0, 1.0)
             */
            // Dense vector: all six values spelled out.
            Vector vMale = Vectors.dense(1, 0, 1, 0, 1, 0);
            // Sparse vector: indices 0,1,2,3,5 are 1.0; index 4 is omitted and
            // therefore defaults to 0.0.
            int len = 6;
            int[] index = new int[]{0, 1, 2, 3, 5};
            double[] values = new double[]{1, 1, 1, 1, 1};
            Vector vFemale = Vectors.sparse(len, index, values);
            /*
             * A LabeledPoint pairs a double class label with a feature vector;
             * MLlib classifiers train on RDDs of LabeledPoint. Labels here:
             * 1.0 == male, 2.0 == female.
             */
            LabeledPoint train_one = new LabeledPoint(1.0, vMale);   // (1.0, 0.0, 1.0, 0.0, 1.0, 0.0)
            LabeledPoint train_two = new LabeledPoint(2.0, vFemale); // (1.0, 1.0, 1.0, 1.0, 0.0, 1.0)
            // A class may have more than one training sample.
            LabeledPoint train_three = new LabeledPoint(2.0, Vectors.dense(0, 1, 1, 1, 0, 1));
            // Collect the three training samples.
            List<LabeledPoint> trains = new ArrayList<>();
            trains.add(train_one);
            trains.add(train_two);
            trains.add(train_three);
            /*
             * JavaSparkContext produces JavaRDDs (the Java-friendly API);
             * NaiveBayes.train expects the underlying Scala RDD, hence the
             * trainingRDD.rdd() unwrap below.
             */
            JavaRDD<LabeledPoint> trainingRDD = sc.parallelize(trains);
            NaiveBayesModel nb_model = NaiveBayes.train(trainingRDD.rdd());
            // Single test sample; predict() also accepts an RDD of vectors.
            double[] dTest = {0, 0, 0, 0, 1, 0};
            Vector vTest = Vectors.dense(dTest);
            // predict() returns the double label of the most probable class.
            int modelIndex = (int) nb_model.predict(vTest);
            System.out.println("标签分类编号:" + modelIndex);
            // Posterior probability for each class, in label order.
            System.out.println(nb_model.predictProbabilities(vTest));
            if (modelIndex == 1) {
                System.out.println("答案:贝叶斯分类器推断这个人的性别是男性");
            } else if (modelIndex == 2) {
                System.out.println("答案:贝叶斯分类器推断这个人的性别是女性");
            }
        }
    }
}

@ -0,0 +1,126 @@
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.junit.Test;
import com.appleyk.process.ModelProcess;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
public class HanLPTest {

    /**
     * Segments a Chinese sentence with HanLP after registering a custom
     * dictionary entry, printing each resulting term.
     */
    @Test
    public void TestA() {
        String lineStr = "明天虽然会下雨,但是我还是会看周杰伦的演唱会。";
        try {
            Segment segment = HanLP.newSegment();
            segment.enableCustomDictionary(true);
            // Register "虽然会" as a custom word with POS tag "ng" and
            // frequency 0 so the segmenter treats it as a single token.
            CustomDictionary.add("虽然会", "ng 0");
            List<Term> seg = segment.seg(lineStr);
            for (Term term : seg) {
                System.out.println(term.toString());
            }
        } catch (Exception ex) {
            // NOTE(review): this broad catch only prints, so the @Test passes
            // even when segmentation fails. Deliberate best-effort demo;
            // consider rethrowing if this should gate a build.
            System.out.println(ex.getClass() + "," + ex.getMessage());
        }
    }

    /**
     * Runs a sample question through the project's {@code ModelProcess}
     * pipeline. Requires HanLP data installed at {@code D:/HanLP/data}.
     *
     * @throws Exception propagated from the model pipeline
     */
    @Test
    public void TestC() throws Exception {
        ModelProcess query = new ModelProcess("D:/HanLP/data");
        String[] questionArr = new String[]{"卧虎藏龙的分数是多少"};
        for (String que : questionArr) {
            ArrayList<String> question = query.analyQuery(que);
            System.err.println(question);
        }
    }

    /**
     * Minimal Spark MLlib naive Bayes round-trip: builds three labeled
     * 3-feature vectors, trains a model, and prints the predicted label and
     * class probabilities for one test vector.
     */
    @Test
    public void TestRDD() {
        SparkConf conf = new SparkConf().setAppName("NaiveBayesTest").setMaster("local[*]");
        // try-with-resources closes the context even if training/prediction
        // throws; the original only closed it on the success path (leak on failure).
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            /*
             * MLlib local vectors: DenseVector (all slots explicit) vs.
             * SparseVector (listed indices only; the rest default to 0.0).
             *   dense  == (1.0, 0.0, 2.0)
             *   sparse == (2.0, 3.0, 0.0)
             */
            Vector dense = Vectors.dense(1.0, 0.0, 2.0);
            System.out.println(dense);
            // Sparse form: indices 0 and 1 set; index 2 defaults to 0.0.
            int len = 3;
            int[] index = new int[]{0, 1};
            double[] values = new double[]{2.0, 3.0};
            Vector sparse = Vectors.sparse(len, index, values);
            /*
             * LabeledPoint pairs a double class label with a feature vector;
             * one sample per class here (labels 1.0, 2.0, 3.0).
             */
            LabeledPoint train_one = new LabeledPoint(1.0, dense);  // (1.0, 0.0, 2.0)
            LabeledPoint train_two = new LabeledPoint(2.0, sparse); // (2.0, 3.0, 0.0)
            LabeledPoint train_three = new LabeledPoint(3.0, Vectors.dense(1, 1, 2)); // (1.0, 1.0, 2.0)
            // Collect the three training samples.
            List<LabeledPoint> trains = new ArrayList<>();
            trains.add(train_one);
            trains.add(train_two);
            trains.add(train_three);
            // Parallelize to a JavaRDD, then unwrap to the Scala RDD that
            // NaiveBayes.train expects.
            JavaRDD<LabeledPoint> trainingRDD = sc.parallelize(trains);
            NaiveBayesModel nb_model = NaiveBayes.train(trainingRDD.rdd());
            // Single test sample; predict() also accepts an RDD of vectors.
            double[] dTest = {2, 1, 0};
            Vector vTest = Vectors.dense(dTest);
            System.err.println(nb_model.predict(vTest)); // most probable class label
            // Posterior probability for each class, in label order.
            System.err.println(nb_model.predictProbabilities(vTest));
        }
    }
}
Loading…
Cancel
Save