利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-17 22:16:03 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词。是不是很好玩,哈哈。写这篇文章就是让你简单地、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)。项目结构:当然,text.da……
|
分词处理,具体看实现。
Chunk.java:
package grid.text.participle;
import grid.text.dic.CnDictionary;
import java.util.List;
/**
 * One candidate segmentation ("chunk"): an ordered list of words together with
 * the statistics used to rank it against other candidates.
 *
 * <p>The natural ordering puts the <em>preferred</em> chunk first. Ties are
 * broken in the same order as the MMSEG disambiguation rules:
 * <ol>
 *   <li>larger total length ({@link #getLen()});</li>
 *   <li>larger average word length ({@link #getAvg()});</li>
 *   <li>smaller spread of word lengths ({@link #getVariance()});</li>
 *   <li>larger sum of {@code CnDictionary.rate(char)} over the
 *       single-character words.</li>
 * </ol>
 *
 * <p>{@code equals} is not overridden, so {@code compareTo} may return 0 for
 * chunks that are not equal.
 */
public class Chunk implements Comparable<Chunk> {
    /** Differences smaller than this are treated as "equal" when comparing doubles. */
    private static final double EPSILON = 0.0000001D;

    private final List<String> list;
    private int len = 0;
    private double avg = 0;
    private double variance = 0;

    /**
     * @param list the words of this chunk, in text order; {@code null} is
     *             treated as an empty chunk
     */
    public Chunk(List<String> list) {
        this.list = (null == list) ? java.util.Collections.<String>emptyList() : list;
        init();
    }

    /** Computes the total length, average word length and spread of word lengths. */
    private void init() {
        if (list.isEmpty()) {
            // Leave every statistic at 0; dividing by size 0 would make avg NaN,
            // and NaN compares as "equal" to everything in compareTo.
            return;
        }
        for (String word : list) {
            len += word.length();
        }
        avg = (double) len / list.size();
        double squaredDeviations = 0;
        for (String word : list) {
            squaredDeviations += Math.pow(avg - word.length(), 2);
        }
        variance = Math.sqrt(squaredDeviations);
    }

    /** @return the sum of the lengths of all words in this chunk */
    public int getLen() {
        return len;
    }

    /** @return the total length divided by the number of words (0 for an empty chunk) */
    public double getAvg() {
        return avg;
    }

    /**
     * @return the square root of the sum of squared deviations of the word
     *         lengths from the average. Despite the name this is not divided
     *         by the word count; it is only used to rank chunks.
     */
    public double getVariance() {
        return variance;
    }

    /** @return the first word of this chunk, or {@code ""} if the chunk is empty */
    public String getHead() {
        if (list.isEmpty()) {
            return "";
        }
        return list.get(0);
    }

    /**
     * Compares two doubles so that the larger one sorts first.
     *
     * @return -1 if {@code d1} is larger, 1 if {@code d2} is larger, 0 if they
     *         differ by no more than {@link #EPSILON}
     */
    private static int compareDescending(double d1, double d2) {
        double diff = d1 - d2;
        if (diff < -EPSILON) {
            return 1;
        } else if (diff > EPSILON) {
            return -1;
        }
        return 0;
    }

    /** Sums {@code dictionary.rate(c)} over every one-character word in {@code words}. */
    private static double singleCharRate(List<String> words, CnDictionary dictionary) {
        double rate = 0;
        for (String word : words) {
            if (1 == word.length()) {
                rate += dictionary.rate(word.charAt(0));
            }
        }
        return rate;
    }

    /**
     * Orders chunks so that the preferred segmentation sorts first; see the
     * class comment for the tie-breaking rules.
     *
     * @param o the chunk to compare against
     * @return a negative value if this chunk is preferred over {@code o}, a
     *         positive value if {@code o} is preferred, 0 if they rank equally
     */
    @Override
    public int compareTo(Chunk o) {
        if (len != o.len) {
            // Explicit comparison instead of subtraction: same sign, no overflow.
            return len > o.len ? -1 : 1;
        }
        int d = compareDescending(avg, o.avg);
        if (0 != d) {
            return d;
        }
        // Smaller spread wins, so the descending comparison is negated here.
        // Example: lengths [2,2,2] must beat [1,2,3] (same total, same average).
        d = -compareDescending(variance, o.variance);
        if (0 != d) {
            return d;
        }
        // The dictionary is only consulted when every cheaper rule ties.
        CnDictionary dictionary = CnDictionary.Instance();
        return compareDescending(singleCharRate(list, dictionary),
                singleCharRate(o.list, dictionary));
    }

    /** @return the word list rendered by {@link List#toString()} */
    @Override
    public String toString() {
        return list.toString();
    }
}
ChunkStream.java:
package grid.text.participle;
import grid.common.Node;
import grid.common.TextUtils;
import grid.common.Tree;
import grid.text.dic.CnDictionary;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Picks the next word at a given offset of a text by enumerating candidate
 * segmentations of up to {@link #PREDICT_LEVEL} consecutive words, ranking
 * them as {@link Chunk}s, and returning the first word of the best one.
 */
public class ChunkStream {
    /**
     * Define the max supposed word length.
     *
     * You could shorten the value if you don't need too long participle result.
     */
    private static final int MAX_WORD_LEN = 7;

    /**
     * Define the predict level while execute participle.
     *
     * Negligible accuracy will be promoted if you increase this value.
     */
    private static final int PREDICT_LEVEL = 3;

    private static final CnDictionary dictionary = CnDictionary.Instance();

    /**
     * Returns the word that starts at {@code off} in the best-ranked candidate
     * segmentation.
     *
     * @param text the full text being segmented
     * @param off  index in {@code text} at which the word must start
     * @return the head word of the best chunk, or {@code ""} when no candidate
     *         chunk could be built
     */
    public String next(String text, int off) {
        Tree<String> root = new Tree<String>("ROOT");
        recurse(root, off, text, 0);

        List<Node<String>> leaves = root.getLeaves();
        List<Chunk> chunkList = new ArrayList<Chunk>();
        for (Node<String> leaf : leaves) {
            chunkList.add(new Chunk(leaf.getBranchPath()));
        }
        if (chunkList.isEmpty()) {
            // Nothing to rank; avoid IndexOutOfBoundsException on get(0).
            return "";
        }
        // Chunk's natural ordering puts the preferred candidate first.
        Collections.sort(chunkList);
        return chunkList.get(0).getHead();
    }

    /**
     * Adds to {@code node} one child per candidate word starting at
     * {@code off}, then recurses below each child for the following word until
     * {@link #PREDICT_LEVEL} words have been chained.
     *
     * <p>Candidates are tried from the longest length down to 1. Lengths of 2
     * or more are accepted only if the dictionary contains them; a single
     * character is accepted only if {@code TextUtils.isCnLetter} says so.
     *
     * @param node        tree node the candidate words are attached to
     * @param off         index in {@code text} where the candidate word starts
     * @param text        the full text being segmented
     * @param predictDeep number of words already chained above {@code node}
     */
    private void recurse(Node<String> node, int off, String text, int predictDeep) {
        if (predictDeep >= PREDICT_LEVEL) {
            return;
        }
        // Clamp the longest candidate to what is left of the text.
        int longest = MAX_WORD_LEN + off > text.length() ? text.length() - off
                : MAX_WORD_LEN;
        for (int len = longest; len >= 1; len--) {
            String candidate = text.substring(off, off + len);
            if (len < 2) {
                if (!TextUtils.isCnLetter(text.charAt(off))) {
                    return;
                }
                recurse(node.add(candidate), off + 1, text, predictDeep + 1);
            } else if (dictionary.contains(candidate)) {
                recurse(node.add(candidate), off + candidate.length(), text, predictDeep + 1);
            }
        }
    }
}
MechanicalParticiple.java:
package grid.text.participle;
import grid.common.TextUtils;
import java.util.Vector;
/**
 * Splits a document into tokens: runs of English letters/digits become one
 * token each, Chinese text is cut word by word through {@link ChunkStream},
 * and every other character acts as a separator and is dropped.
 */
public class MechanicalParticiple {
    /**
     * Segments {@code document} into tokens, in document order.
     *
     * @param document the text to segment; {@code null} yields an empty result
     * @return the tokens found (never {@code null})
     */
    public Vector<String> partition(String document) {
        Vector<String> vector = new Vector<String>();
        if (null == document) {
            return vector;
        }
        final int docLen = document.length();
        // Accumulates the current run of English letters / digits.
        StringBuilder seg = new StringBuilder();
        ChunkStream stream = new ChunkStream();
        int off = 0;
        while (off < docLen) {
            char c = document.charAt(off);
            if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
                seg.append(c);
                off++;
            } else if (TextUtils.isCnLetter(c)) {
                flush(seg, vector);
                String word = stream.next(document, off);
                if (!TextUtils.isBlank(word)) {
                    vector.add(word);
                    off += word.length();
                } else {
                    // No usable word came back: skip this character so the
                    // loop always makes progress instead of spinning forever.
                    off++;
                }
            } else {
                flush(seg, vector);
                /*
                 * TODO: Uncomment the "ELSE IF" clause if you would like to
                 * reserve punctuations
                 */
                // else if (!TextUtils.isBlank("" + c)) { vector.add("" + c); }
                off++;
            }
        }
        flush(seg, vector);
        return vector;
    }

    /** Emits the pending letter/digit run as a token, if it is not blank, and clears it. */
    private static void flush(StringBuilder seg, Vector<String> out) {
        String pending = seg.toString();
        if (!TextUtils.isBlank(pending)) {
            out.add(pending);
            seg.setLength(0);
        }
    }
}
selector(编辑:应用网_阳江站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容! |


