Using the new lazyTokenizeSentences API from Sudachi v0.7.4
azagniotov authored and Alexander Zagniotov committed Aug 3, 2024
1 parent 9d2edbe commit 2dd7bec
Showing 7 changed files with 9,507 additions and 117 deletions.
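For context: Sudachi v0.7.4 added a lazyTokenizeSentences method to its Tokenizer, which yields analyzed sentences on demand from a Reader instead of buffering and tokenizing the whole input up front. The sketch below is a minimal illustration of that idea and is not code from this commit; it assumes the upstream signature Iterator<MorphemeList> lazyTokenizeSentences(SplitMode, Reader) and uses a placeholder settings string.

import com.worksap.nlp.sudachi.Dictionary;
import com.worksap.nlp.sudachi.DictionaryFactory;
import com.worksap.nlp.sudachi.Morpheme;
import com.worksap.nlp.sudachi.MorphemeList;
import com.worksap.nlp.sudachi.Tokenizer;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;

public class LazyTokenizeSketch {
    public static void main(final String[] args) throws Exception {
        // Placeholder settings JSON; the real project resolves the dictionary via its own config.
        final String settings = "{\"systemDict\":\"/tmp/sudachi/system_full.dic\"}";
        try (final Dictionary dictionary = new DictionaryFactory().create(settings)) {
            final Tokenizer tokenizer = dictionary.create();
            final Reader input = new StringReader("すもももももももものうち。清水寺は東京都にあります。");
            // Assumed v0.7.4 API: sentences are tokenized lazily, one at a time,
            // so arbitrarily large inputs are never materialized in memory at once.
            final Iterator<MorphemeList> sentences = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, input);
            while (sentences.hasNext()) {
                for (final Morpheme morpheme : sentences.next()) {
                    System.out.println(morpheme.surface() + "\t" + morpheme.normalizedForm());
                }
            }
        }
    }
}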
2 changes: 1 addition & 1 deletion conf/gradle/sudachi.gradle
@@ -25,7 +25,7 @@ plugins.withType(JavaPlugin) {
targetDir = file("/tmp/sudachi")
dictionaryName = "sudachi-dictionary-${sudachiDictionaryVersion}-${sudachiDictionaryType}"
downloadDestination = "${rootDir}/.sudachi/downloaded/${dictionaryName}.zip"
dictChecksum = "cbf35332df534fb4dfb5eb57d04c894e"
dictChecksum = "28484266ae6231d27dc745ff27adc459"
}

task deleteDictionaryData() {
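An aside on the dictChecksum bump above: the value has the shape of an MD5 digest of the downloaded dictionary zip (an assumption; the verification step lives outside this hunk). A throwaway sketch for checking a local download, with the path inferred from the downloadDestination pattern above:

import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.DigestInputStream;
import java.security.MessageDigest;

public class DictionaryChecksumSketch {
    public static void main(final String[] args) throws Exception {
        // Assumed location, following the downloadDestination pattern in sudachi.gradle.
        final Path zip = Path.of(".sudachi/downloaded/sudachi-dictionary-20240716-full.zip");
        final MessageDigest md5 = MessageDigest.getInstance("MD5");
        try (final InputStream in = new DigestInputStream(Files.newInputStream(zip), md5)) {
            in.transferTo(OutputStream.nullOutputStream()); // drain the stream to feed the digest
        }
        final StringBuilder hex = new StringBuilder();
        for (final byte b : md5.digest()) {
            hex.append(String.format("%02x", b));
        }
        // Should print 28484266ae6231d27dc745ff27adc459 if the MD5 assumption holds.
        System.out.println(hex);
    }
}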
4 changes: 2 additions & 2 deletions gradle.properties
@@ -2,9 +2,9 @@ solrLuceneAnalyzerSudachiProjectName=solr-lucene-analyzer-sudachi
solrLuceneAnalyzerSudachiProjectGroup=io.github.azagniotov
solrLuceneAnalyzerSudachiProjectVersion=1.0.0-SNAPSHOT

- sudachiVersion=0.7.3
+ sudachiVersion=0.7.4
sudachiDictionaryType=full
- sudachiDictionaryVersion=20240409
+ sudachiDictionaryVersion=20240716

solrVersion=9.4.0
luceneVersion=9.8.0
SudachiAnalyzerTest.java
@@ -44,21 +44,31 @@

public class SudachiAnalyzerTest extends BaseTokenStreamTestCase {

- private Analyzer analyzer;
+ private Analyzer defaultAnalyzer;

@Override
public void setUp() throws Exception {
super.setUp();
- analyzer = new SudachiAnalyzer(
+ defaultAnalyzer = new SudachiAnalyzer(
SudachiAnalyzer.getDefaultStopSet(), SudachiAnalyzer.getDefaultStopTags(), true, "search");
}

@Override
public void tearDown() throws Exception {
- analyzer.close();
+ defaultAnalyzer.close();
super.tearDown();
}

+ @Test
+ public void testAnalyzeLargeText_9MB() throws Exception {
+ final InputStream textInputStream =
+ this.getClass().getResourceAsStream("/9mb.japanese.history.large.content.txt");
+ final String japanese = new Scanner(textInputStream).useDelimiter("\\A").next();
+
+ final TokenStream tokenStream = defaultAnalyzer.tokenStream("any", japanese);
+ assertNotNull(tokenStream);
+ }
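Not part of the commit, but worth noting: the added test only asserts that a TokenStream was created. A stream is consumed via the standard Lucene contract (reset, incrementToken, end, close); a sketch that drains it end to end, reusing the test's defaultAnalyzer and japanese locals:

// Requires org.apache.lucene.analysis.TokenStream and
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
try (final TokenStream stream = defaultAnalyzer.tokenStream("any", japanese)) {
    final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    long tokenCount = 0;
    while (stream.incrementToken()) {
        if (tokenCount++ < 5) {
            System.out.println(term); // peek at the first few terms
        }
    }
    stream.end();
    System.out.println("Total tokens: " + tokenCount);
}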

@Test
public void testLargeTextLoadTestWithUnfilteredStopWords() throws Exception {
final Analyzer analyzer = new Analyzer() {
@@ -213,7 +223,7 @@ public void testRepeatedHiraganaWord() throws Exception {

final List<String> nCopies = Collections.nCopies(limit, hiraganaWord);

- assertAnalyzesTo(analyzer, sb.toString(), nCopies.toArray(new String[0]));
+ assertAnalyzesTo(defaultAnalyzer, sb.toString(), nCopies.toArray(new String[0]));
}

@Test
@@ -228,7 +238,7 @@ public void testRepeatedKatakanaWord() throws Exception {
sb.append(new String(new char[limit]).replace("\0", katakanaWord));

final List<String> nCopies = Collections.nCopies(limit, katakanaWord);
- assertAnalyzesTo(analyzer, sb.toString(), nCopies.toArray(new String[0]));
+ assertAnalyzesTo(defaultAnalyzer, sb.toString(), nCopies.toArray(new String[0]));
}

@Test
@@ -240,7 +250,7 @@ public void testRepeatedKanjiWord() throws Exception {
sb.append(new String(new char[limit]).replace("\0", kanjiWord));

final List<String> nCopies = Collections.nCopies(limit, kanjiWord);
- assertAnalyzesTo(analyzer, sb.toString(), nCopies.toArray(new String[0]));
+ assertAnalyzesTo(defaultAnalyzer, sb.toString(), nCopies.toArray(new String[0]));
}

@Test
@@ -255,14 +265,15 @@ public void testDecomposition() throws IOException {
//

// 'Full' dictionary by Sudachi does not split this properly to すもも and もも
assertAnalyzesTo(analyzer, "すもももももももものうち。", new String[] {"すもももももも", "もも"});
assertAnalyzesTo(analyzer, "エーービ〜〜〜シ〰〰〰〰", new String[] {"エービーシ"});
assertAnalyzesTo(analyzer, "シュミレーション", new String[] {"シュミレーション"});
assertAnalyzesTo(analyzer, "ちゃあ", new String[] {}); // Result ちゃあ => だ got filtered out due to stopwords.txt
assertAnalyzesTo(analyzer, "打ち込む", new String[] {"打つ", "込む"});
assertAnalyzesTo(defaultAnalyzer, "すもももももももものうち。", new String[] {"すもももももも", "もも"});
assertAnalyzesTo(defaultAnalyzer, "エーービ〜〜〜シ〰〰〰〰", new String[] {"エービーシ"});
assertAnalyzesTo(defaultAnalyzer, "シュミレーション", new String[] {"シュミレーション"});
assertAnalyzesTo(
defaultAnalyzer, "ちゃあ", new String[] {}); // Result ちゃあ => だ got filtered out due to stopwords.txt
assertAnalyzesTo(defaultAnalyzer, "打ち込む", new String[] {"打つ", "込む"});

assertAnalyzesTo(
- analyzer,
+ defaultAnalyzer,
"The quick 客室乗務員 brown FOXes jumps over the lazy dogs and computers 医薬品安全管理責任者",
new String[] {
"the",
@@ -287,27 +298,30 @@ public void testDecomposition() throws IOException {
"者"
});

assertAnalyzesTo(analyzer, "清水寺は東京都にあります。", new String[] {"清水寺", "東京", "都"});
assertAnalyzesTo(defaultAnalyzer, "清水寺は東京都にあります。", new String[] {"清水寺", "東京", "都"});

assertAnalyzesTo(analyzer, "メガネは顔の一部です。", new String[] {"メガネ", "顔", "一部"});
assertAnalyzesTo(defaultAnalyzer, "メガネは顔の一部です。", new String[] {"メガネ", "顔", "一部"});

assertAnalyzesTo(analyzer, "日本経済新聞でモバゲーの記事を読んだ。", new String[] {"日本", "経済", "新聞", "モバゲ", "記事", "読む"});
assertAnalyzesTo(defaultAnalyzer, "日本経済新聞でモバゲーの記事を読んだ。", new String[] {"日本", "経済", "新聞", "モバゲ", "記事", "読む"});

assertAnalyzesTo(analyzer, "Java, Scala, Groovy, Clojure", new String[] {"java", "scala", "groovy", "clojure"});
assertAnalyzesTo(
defaultAnalyzer, "Java, Scala, Groovy, Clojure", new String[] {"java", "scala", "groovy", "clojure"});

assertAnalyzesTo(analyzer, "LUCENE、SOLR、Lucene, Solr", new String[] {"lucene", "solr", "lucene", "solr"});
assertAnalyzesTo(
defaultAnalyzer, "LUCENE、SOLR、Lucene, Solr", new String[] {"lucene", "solr", "lucene", "solr"});

// Need an entry in user dictionary to fix: さしすせそ (the すせ is missing in the result)
- assertAnalyzesTo(
- analyzer, "アイウエオカキクケコさしすせそABCXYZ123456", new String[] {"アイウエオカキクケコ", "さし", "そ", "abcxyz", "123456"});
+ assertAnalyzesTo(defaultAnalyzer, "アイウエオカキクケコさしすせそABCXYZ123456", new String[] {
+ "アイウエオカキクケコ", "さし", "そ", "abcxyz", "123456"
+ });

// The "たろう" is removed by the Sudachi Analyzer because of:
// 1. BaseForm filter:
// たろう => だ; and
// 2. SudachiPartOfSpeechStopFilter:
// the auxiliary verb (助動詞) it is uncommented in the stoptags.txt,
// thus the token is removed from the token stream.
assertAnalyzesTo(analyzer, "ももたろう", new String[] {"もも"});
assertAnalyzesTo(defaultAnalyzer, "ももたろう", new String[] {"もも"});
}

private Tokenizer createTokenizer(final Map<String, String> args) throws IOException {