欢迎来到天天文库
浏览记录
ID:56825819
大小:31.00 KB
页数:7页
时间:2020-07-15
《java中文分词代码.doc》由会员上传分享,免费在线阅读,更多相关内容在教育资源-天天文库。
1、/* *created by yzh 2004.5.12 *请大家引用时保留这段作者声明,此代码为开源代码;使用不受限制,欢迎大家采用本人所写JS动态拖动表格实现代码。 *中文分词代码 *此代码为作者多年经验总结,以前发表过VB,PB版本*/importjava.io.BufferedReader;importjava.io.IOException;importjava.io.InputStream;importjava.io.InputStreamReader;importjava.util.Locale;importjava.util.TreeMap;importjava.util.TreeSe
2、t;publicclassChineseSegmenter{ privatestaticChineseSegmentersegmenter=null; //privateHashtablezhwords; privateTreeMapzhwords; privateTreeSetcforeign,cnumbers; //Charform publicfinalstaticintTRAD=0; publicfinalstaticintSIMP=1; publicfinalstaticintBOTH=2; //CharformisTRAD,SIMPorBOTH privateC
3、hineseSegmenter(intcharform,booleanloadwordfile){ cforeign=newTreeSet(); cnumbers=newTreeSet(); if(charform==SIMP){ loadset(cnumbers,"data/snumbers_u8.txt"); loadset(cforeign,"data/sforeign_u8.txt"); }elseif(charform==TRAD){ loadset(cnumbers,"data/tnumbers_u
4、8.txt"); loadset(cforeign,"data/tforeign_u8.txt"); }else{//BOTH loadset(cnumbers,"data/snumbers_u8.txt"); loadset(cforeign,"data/sforeign_u8.txt"); loadset(cnumbers,"data/tnumbers_u8.txt"); loadset(cforeign,"data/tforeign_u8.txt"); } //zhwords=newHashta
5、ble(); zhwords=newTreeMap(); if(!loadwordfile){ return; } Stringnewword=null; try{ InputStreamworddata=null; if(charform==SIMP){ worddata=getClass().getResourceAsStream("simplexu8.txt"); }elseif(charform==TRAD){ worddata=getClass().
6、getResourceAsStream("tradlexu8.txt"); }elseif(charform==BOTH){ worddata=getClass().getResourceAsStream("bothlexu8.txt"); } BufferedReaderin=newBufferedReader(newInputStreamReader( worddata,"UTF8")); while((newword=in.readLine())!=null){ if
7、((newword.indexOf("#")==-1)&&(newword.length()<5)){ zhwords.put(newword.intern(),"1"); if(newword.length()==3){ if(zhwords.containsKey(newword.substring(0,2)
此文档下载收益归作者所有