正文
* 环境准备 (Windows或Linux版本都行):
R下载:http://mirrors.ustc.edu.cn/CRAN/
Rwordseg包下载:https://r-forge.r-project.org/R/?group_id=1054
rJava包下载:http://cran.r-project.org/web/packages/rJava/index.html
将Rwordseg和rJava这些包解压后,放到R安装目录下的library文件夹(例如 \R\R-3.1.0\library)即可
* R语言实现代码
-
# 加载rJava、Rwordseg库
-
library(rJava);
-
library(Rwordseg);
-
-
# == Read the input data
# The input is a comma-separated file with a header row, read as UTF-8.
lecture <- read.csv(
  "E:\\worldcup_test.txt",
  sep = ",",
  header = TRUE,
  fileEncoding = "UTF-8"
)

# Look at the first rows to spot character-encoding problems.
head(lecture)

# Number of records in the data set.
n <- nrow(lecture)
print(n)
-
-
# == Text preprocessing
# Flatten the data frame into a vector of its cells, dropping cells that are a
# single space. Missing cells are dropped as well so that NA never reaches the
# segmentation step.
res <- lecture[lecture != " "]
res <- res[!is.na(res)]

# Strip URLs (both http: and https:).
res <- gsub(pattern = "https?:[a-zA-Z\\/\\.0-9]+", replacement = "", x = res)

# Strip very common single-character stop words. Inside a bracket expression
# "|" is a literal character, not alternation, so the characters are listed
# without separators; otherwise pipe characters would be removed too.
res <- gsub(pattern = "[我你的了是]", replacement = "", x = res)
-
-
# == Word segmentation and frequency counting
# Segment every text with segmentCN (from Rwordseg) and flatten the result
# into one vector of tokens.
words <- unlist(lapply(X = res, FUN = segmentCN))

# Split any token that still contains spaces, then count occurrences.
word <- lapply(X = words, FUN = strsplit, " ")
v <- table(unlist(word))

# Sort by frequency, most frequent first.
v <- sort(v, decreasing = TRUE)

# Build a plain two-column data frame. as.vector() drops the table class so
# the count column is really named "freq" instead of being expanded into
# "freq.Var1" / "freq.Freq".
d <- data.frame(word = as.character(names(v)), freq = as.vector(v))

# Keep only words longer than one character that occur at least 100 times.
d <- subset(d, nchar(as.character(d$word)) > 1 & d$freq >= 100)
-
-
# == 输出结果