欧美性猛交XXXX免费看蜜桃,成人网18免费韩国,亚洲国产成人精品区综合,欧美日韩一区二区三区高清不卡,亚洲综合一区二区精品久久

打開(kāi)APP
userphoto
未登錄

開(kāi)通VIP,暢享免費電子書(shū)等14項超值服

開(kāi)通VIP
測試lucene的所有分詞接口

測試lucene的所有分詞接口(原創)- -

TagLucene    中文    分詞                                          

Lucene本身提供了幾個分詞接口,我後來又寫了一個分詞接口.

功能遞增如下:

WhitespaceAnalyzer:僅僅是去除空格,對字符沒有lowercase化,不支持中文

SimpleAnalyzer:功能強於WhitespaceAnalyzer,將除去letter之外的符號全部過濾掉,並且將所有的字符lowercase化,不支持中文

StopAnalyzer:StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基礎上
    增加了去除StopWords的功能,不支持中文

StandardAnalyzer:英文的處理能力同于StopAnalyzer.支持中文采用的方法為單字切分.

ChineseAnalyzer:來自於Lucene的sand box.性能類似於StandardAnalyzer,缺點是不支持中英文混合分詞.

CJKAnalyzer:chedong寫的CJKAnalyzer在英文處理上的功能和StandardAnalyzer相同,
    但是在漢語的分詞上不能過濾掉標點符號,即使用二元切分

TjuChineseAnalyzer:我寫的,功能最為強大.TjuChineseAnalyzer在中文分詞方面由於其調用的為ICTCLAS的java接口,所以其在中文方面性能上同於ICTCLAS.其在英文分詞上采用了Lucene的StopAnalyzer,可以去除stopWords,而且可以不區分大小寫,過濾掉各類標點符號.

程序調試于:JBuilder 2005

package org.apache.lucene.analysis;

//Author:zhangbufeng
//TjuAILab(天津大學(xué)人工智能實(shí)驗室)
//2005.9.22.11:00


import java.io.*;
import junit.framework.*;

import org.apache.lucene.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.cn.*;
import org.apache.lucene.analysis.cjk.*;
import org.apache.lucene.analysis.tjucn.*;
import com.xjt.nlp.word.*;
public class TestAnalyzers extends TestCase {

   public TestAnalyzers(String name) {
      super(name);
   }

  public void assertAnalyzesTo(Analyzer a,
                               String input,
                               String[] output) throws Exception {
    //前面的"dummy"好像沒(méi)有用到
    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    StringReader readerInput=new StringReader(input);
    for (int i=0; i      Token t = ts.next();
      //System.out.println(t);
      assertNotNull(t);
      //使用下面這條語(yǔ)句即可以輸出Token的每項的text,并且用空格分開(kāi)
      System.out.print(t.termText);
      System.out.print(" ");
      assertEquals(t.termText(), output[i]);
    }
    System.out.println(" ");
    assertNull(ts.next());
    ts.close();
  }
 public void outputAnalyzer(Analyzer a ,String input) throws Exception{
   TokenStream ts = a.tokenStream("dummy",new StringReader(input));
   StringReader readerInput = new StringReader(input);
   while(true){
     Token t = ts.next();
     if(t!=null){
       System.out.print(t.termText);
       System.out.print(" ");
     }
     else
     break;

   }
 System.out.println(" ");
 ts.close();
 }

  public void testSimpleAnalyzer() throws Exception {
    //學(xué)習使用SimpleAnalyzer();
    //SimpleAnalyzer將除去letter之外的符號全部過(guò)濾掉,并且將所有的字符lowcase化
    Analyzer a = new SimpleAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "U.S.A.",
                     new String[] { "u", "s", "a" });
    assertAnalyzesTo(a, "C++",
                     new String[] { "c" });
    assertAnalyzesTo(a, "B2B",
                     new String[] { "b", "b" });
    assertAnalyzesTo(a, "2B",
                     new String[] { "b" });
    assertAnalyzesTo(a, "\"QUOTED\" word",
                     new String[] { "quoted", "word" });
    assertAnalyzesTo(a,"zhang ./ bu <> feng",
                     new String[]{"zhang","bu","feng"});
    ICTCLAS splitWord = new ICTCLAS();
    String result = splitWord.paragraphProcess("我愛(ài)共產(chǎn)黨 i LOVE chanchan");
    assertAnalyzesTo(a,result,
                     new String[]{"我","愛(ài)","共產(chǎn)黨","i","love","chanchan"});

  }

  public void testWhiteSpaceAnalyzer() throws Exception {
    //WhiterspaceAnalyzer僅僅是去除空格,對字符沒(méi)有lowcase化
    Analyzer a = new WhitespaceAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "FOO", "BAR" });
    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR",
                     new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                     new String[] { "foo.bar.FOO.BAR" });
    assertAnalyzesTo(a, "U.S.A.",
                     new String[] { "U.S.A." });
    assertAnalyzesTo(a, "C++",
                     new String[] { "C++" });

    assertAnalyzesTo(a, "B2B",
                     new String[] { "B2B" });
    assertAnalyzesTo(a, "2B",
                     new String[] { "2B" });
    assertAnalyzesTo(a, "\"QUOTED\" word",
                     new String[] { "\"QUOTED\"", "word" });

    assertAnalyzesTo(a,"zhang bu feng",
                     new String []{"zhang","bu","feng"});
    ICTCLAS splitWord = new ICTCLAS();
    String result = splitWord.paragraphProcess("我愛(ài)共產(chǎn)黨 i love chanchan");
    assertAnalyzesTo(a,result,
                     new String[]{"我","愛(ài)","共產(chǎn)黨","i","love","chanchan"});
  }

  public void testStopAnalyzer() throws Exception {
    //StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基礎上
    //增加了去除StopWords的功能
   Analyzer a = new StopAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
                     new String[]{"foo","bar","foo","bar"});
    ICTCLAS splitWord = new ICTCLAS();
    String result = splitWord.paragraphProcess("我愛(ài)共產(chǎn)黨 i Love chanchan such");
    assertAnalyzesTo(a,result,
                     new String[]{"我","愛(ài)","共產(chǎn)黨","i","love","chanchan"});

  }
  public void testStandardAnalyzer() throws Exception{
  //StandardAnalyzer的功能最為強大,對于中文采用的為單字切分
  Analyzer a = new StandardAnalyzer();
  assertAnalyzesTo(a,"foo bar Foo Bar",
                   new String[]{"foo","bar","foo","bar"});
  assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
                   new String[]{"foo","bar","foo","bar"});
  assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
                     new String[]{"foo","bar","foo","bar"});
  assertAnalyzesTo(a,"張步峰是天大學(xué)生",
                   new String[]{"張","步","峰","是","天","大","學(xué)","生"});
  //驗證去除英文的標點(diǎn)符號
  assertAnalyzesTo(a,"張,/步/,峰,.是.,天大<>學(xué)生",
                   new String[]{"張","步","峰","是","天","大","學(xué)","生"});
  //驗證去除中文的標點(diǎn)符號
  assertAnalyzesTo(a,"張。、步。、峰是。天大。學(xué)生",
                   new String[]{"張","步","峰","是","天","大","學(xué)","生"});
  }
  public void testChineseAnalyzer() throws Exception{
  //可見(jiàn)ChineseAnalyzer在功能上和standardAnalyzer的功能差不多,但是可能在速度上慢于StandardAnalyzer
  Analyzer a = new ChineseAnalyzer();

  //去空格
  assertAnalyzesTo(a,"foo bar Foo Bar",
                    new String[]{"foo","bar","foo","bar"});
   assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
                    new String[]{"foo","bar","foo","bar"});
   assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
                      new String[]{"foo","bar","foo","bar"});
   assertAnalyzesTo(a,"張步峰是天大學(xué)生",
                    new String[]{"張","步","峰","是","天","大","學(xué)","生"});
   //驗證去除英文的標點(diǎn)符號
   assertAnalyzesTo(a,"張,/步/,峰,.是.,天大<>學(xué)生",
                    new String[]{"張","步","峰","是","天","大","學(xué)","生"});
   //驗證去除中文的標點(diǎn)符號
   assertAnalyzesTo(a,"張。、步。、峰是。天大。學(xué)生",
                    new String[]{"張","步","峰","是","天","大","學(xué)","生"});
   //不支持中英文寫(xiě)在一起
  // assertAnalyzesTo(a,"我愛(ài)你 i love chanchan",
  ///                  new String[]{"我","愛(ài)","你","i","love","chanchan"});

  }
  public void testCJKAnalyzer() throws Exception {
    //chedong寫(xiě)的CJKAnalyzer的功能在英文處理上的功能和StandardAnalyzer相同
    //但是在漢語(yǔ)的分詞上,不能過(guò)濾掉標點(diǎn)符號,即使用二元切分
    Analyzer a = new CJKAnalyzer();
    assertAnalyzesTo(a,"foo bar Foo Bar",
                    new String[]{"foo","bar","foo","bar"});
    assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
                  new String[]{"foo","bar","foo","bar"});
    assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
                    new String[]{"foo","bar","foo","bar"});

   // assertAnalyzesTo(a,"張,/步/,峰,.是.,天大<>學(xué)生",
     //                new String[]{"張步","步峰","峰是","是天","天大","大學(xué)","學(xué)生"});
    //assertAnalyzesTo(a,"張。、步。、峰是。天大。學(xué)生",
     //                new String[]{"張步","步峰","峰是","是天","天大","大學(xué)","學(xué)生"});
   //支持中英文同時(shí)寫(xiě)
    assertAnalyzesTo(a,"張步峰是天大學(xué)生 i love",
                     new String[]{"張步","步峰","峰是","是天","天大","大學(xué)","學(xué)生","i","love"});

  }
  public void testTjuChineseAnalyzer() throws Exception{
      /**
       * TjuChineseAnlyzer的功能相當強大,在中文分詞方面由于其調用的為ICTCLAS的java接口.
       * 所以其在中文方面性能上同與ICTCLAS.其在英文分詞上采用了Lucene的StopAnalyzer,可以去除
       * stopWords,而且可以不區分大小寫(xiě),過(guò)濾掉各類(lèi)標點(diǎn)符號.
       */
      Analyzer a = new TjuChineseAnalyzer();
      String input = "體育訊 在被尤文淘汰之后,皇馬主帥博斯克拒絕接受媒體對球隊后防線(xiàn)的批評,同時(shí)還為自己排出的首發(fā)陣容進(jìn)行了辯護。"+
          "“失利是全隊的責任,而不僅僅是后防線(xiàn)該受指責,”博斯克說(shuō),“我并不認為我們踢得一塌糊涂。”“我們進(jìn)入了半決賽,而且在晉級的道路上一路奮 "+
         "戰。即使是今天的比賽我們也有幾個(gè)翻身的機會(huì ),但我們面對的對手非常強大,他們踢得非常好。”“我們的球迷應該為過(guò)去幾個(gè)賽季里我們在冠軍杯中的表現感到驕傲。”"+
         "博斯克還說(shuō)。對于博斯克在首發(fā)中排出了久疏戰陣的坎比亞索,賽后有記者提出了質(zhì)疑,認為完全應該將隊內的另一 "+
         "名球員帕文派遣上場(chǎng)以加強后衛線(xiàn)。對于這一疑議,博斯克拒絕承擔所謂的“責任”,認為球隊的首發(fā)沒(méi)有問(wèn)題。“我們按照整個(gè)賽季以來(lái)的方式做了,"+
         "對于人員上的變化我沒(méi)有什么可說(shuō)的。”對于球隊在本賽季的前景,博斯克表示皇馬還有西甲聯(lián)賽的冠軍作為目標。“皇家馬德里在冠軍 "+
        "杯中戰斗到了最后,我們在聯(lián)賽中也將這么做。”"+
        "A Java User Group is a group of people who share a common interest in Java technology and meet on a regular basis to share"+
       " technical ideas and information. The actual structure of a JUG can vary greatly - from a small number of friends and coworkers"+
      " meeting informally in the evening, to a large group of companies based in the same geographic area. "+
      "Regardless of the size and focus of a particular JUG, the sense of community spirit remains the same. ";

      outputAnalyzer(a,input);
    //此處我已經(jīng)對大文本進(jìn)行過(guò)測試,不會(huì )有問(wèn)題效果很好
    outputAnalyzer(a,"我愛(ài)共產(chǎn)黨 ,,。 I love China 我喜歡唱歌 ");
    assertAnalyzesTo(a,"我愛(ài)共產(chǎn)黨 ,,。I love China 我喜歡唱歌",
                   new String[]{"愛(ài)","共產(chǎn)黨","i","love","china","喜歡","唱歌"});
  }
}

本站僅提供存儲服務(wù),所有內容均由用戶(hù)發(fā)布,如發(fā)現有害或侵權內容,請點(diǎn)擊舉報。
打開(kāi)APP,閱讀全文并永久保存 查看更多類(lèi)似文章
猜你喜歡
類(lèi)似文章
網(wǎng)絡(luò )蜘蛛,搜索引擎/網(wǎng)絡(luò )蜘蛛程序代碼 瘋狂代碼!
Lucene常用的Analyzer功能概述以及自定義Analyzer
給Lucene加入性能更好的中文分詞
讓中科院中文分詞系統ICTCLAS為lucene所用的簡(jiǎn)單程序(C#版)
Lucene關(guān)于幾種中文分詞的總結 (Lucene與搜索引擎技術(shù)) - [Matrix - 與 Java 共舞]
構建基于詞典的Lucene分析器
更多類(lèi)似文章 >>
生活服務(wù)
分享 收藏 導長(cháng)圖 關(guān)注 下載文章
綁定賬號成功
后續可登錄賬號暢享VIP特權!
如果VIP功能使用有故障,
可點(diǎn)擊這里聯(lián)系客服!

聯(lián)系客服

欧美性猛交XXXX免费看蜜桃,成人网18免费韩国,亚洲国产成人精品区综合,欧美日韩一区二区三区高清不卡,亚洲综合一区二区精品久久