日语分词方法

1. MeCab (github)

# 安装mecab
!brew install mecab
!brew install mecab-ipadic
!pip install mecab-python3

import MeCab
# Create a MeCab tagger using the "-Owakati" output format option.
mecab = MeCab.Tagger("-Owakati")
# parse() returns one string; splitting on whitespace yields the token list.
wakati_text = mecab.parse('麩菓子は、麩を主材料とした日本の菓子。')
parsed = wakati_text.split()
print(parsed)

2. Fugashi (github)

# 安装fugashi
!pip install fugashi
# 安装字典
!pip install ipadic


import fugashi
import ipadic
import os

# Name of the dictionary in use (kept for reference; not read below).
mecab_dic = "ipadic"

# Locate the dictionary directory shipped with the `ipadic` package and the
# "mecabrc" file inside it.
dic_dir = ipadic.DICDIR
mecabrc = os.path.join(dic_dir, "mecabrc")

# Build the MeCab option string: -d takes the dictionary directory and
# -r takes the mecabrc path; both are quoted inside the option string.
mecab_option = f'-d "{dic_dir}" -r "{mecabrc}"'
mecab = fugashi.GenericTagger(mecab_option)

# Call the tagger on the sample sentence and print the result.
tokens = mecab('麩菓子は、麩を主材料とした日本の菓子。')
print(tokens)
0