【自然语言实战】·第二章(1.1)——获取词语首字字母

一、maven依赖

        <dependency>
            <groupId>net.sourceforge.pinyin4j</groupId>
            <artifactId>pinyin4j</artifactId>
            <version>2.5.0</version>
        </dependency>

二、示例代码

import com.pingan.lcloud.ark.log.LoggerUtil;
import net.sourceforge.pinyin4j.PinyinHelper;
import org.apache.commons.lang3.CharUtils;
import org.apache.commons.lang3.StringUtils;

import java.lang.annotation.Native;
import java.util.Objects;

/**
 * <code>Details determine success.</code>
 * by Liang ZC., Phd@Stanford
 *
 * <p>Utilities for Chinese text: extracting the upper-case pinyin initial of the first character
 * of a string, and stripping punctuation from a string.</p>
 *
 * <p>All methods are static; the public constructor is kept only so that existing callers that
 * instantiate the class keep compiling.</p>
 *
 * @author LIANGZHICHENG035
 * @date 2019-11-6 15:57
 * @see <a href="http://www.stanford.edu">http://www.stanford.edu</a>
 */
public class ChineseUtils {
    /*
     *           N777777777NO
     *         N7777777777777N
     *        M777777777777777N
     *        *N877777777D77777M
     *       N M77777777ONND777M
     *       MN777777777NN  D777
     *     N7ZN777777777NN ~M7778
     *    N777777777777MMNN88777N
     *    N777777777777MNZZZ7777O
     *    DZN7777O77777777777777
     *     N7OONND7777777D77777N
     *      8*M++++?N???$77777$
     *       M7++++N+M77777777N
     *        N77O777777777777$                              M
     *          DNNM$$$$777777N                              D
     *         N*N:=N$777N7777M                             NZ
     *        77Z::::N777777777                          ODZZZ
     *       77N::::::N77777777M                         NNZZZ$
     *     $777:::::::77777777MN                        ZM8ZZZZZ
     *     777M::::::Z7777777Z77                        N++ZZZZNN
     *    7777M:::::M7777777$777M                       $++IZZZZM
     *   M777$:::::N777777*M7777M                       +++++ZZZDN
     *     NN$::::::7777$*M777777N                      N+++ZZZZNZ
     *       N::::::N:7*O:77777777                      N++++ZZZZN
     *       M::::::::::::N77777777+                   +?+++++ZZZM
     *       8::::::::::::D77777777M                    O+++++ZZ
     *        ::::::::::::M777777777N                      O+?D
     *        M:::::::::::M77777777778                     77=
     *        D=::::::::::N7777777777N                    777
     *       INN===::::::=77777777777N                  I777N
     *      ?777N========N7777777777787M               N7777
     *      77777*D======N77777777777N777N?         N777777
     *     I77777$$*N7===M$$77777777$77777777*MMZ77777777N
     *      $$$$$$$$$$*NIZN$$$$$$$$*M$$7777777777777777ON
     *       M$$$$$$$*M    M$$$$$$$*N=N$$$$7777777$$*ND
     *      O77Z$$$$$$$     M$$$$$$$*MNI==*DNNNNM=~N
     *   7 :N MNN$$$*M$      $$$777$8      8D8I
     *     NMM.:7O           777777778
     *                       7777777MN
     *                       M NO .7:
     *                       M   :   M
     *                            8
     */

    /** Regex for the Unicode "Punctuation" general category (P). Symbols such as '$' or '+' are not in it. */
    private static final String PUNCTUATION = "\\pP";

    /** Pre-compiled form of {@link #PUNCTUATION}, so the regex is not recompiled on every call. */
    private static final java.util.regex.Pattern PUNCTUATION_PATTERN =
            java.util.regex.Pattern.compile(PUNCTUATION);

    /** First code point of the CJK range recognised as "Chinese" by this class (U+4E00). */
    private static final char CHINESE_FIRST = '\u4E00';

    /** Last code point of the CJK range recognised as "Chinese" by this class (U+9FA5). */
    private static final char CHINESE_LAST = '\u9FA5';

    public ChineseUtils() {
    }

    /**
     * Returns the initial of {@code chinese} as computed by {@link #getChineseInitial(String)}
     * (punctuation is removed first), or {@code defaultValue} when that initial is empty.
     *
     * <pre>
     *  ChineseUtils.getChineseInitialDefaultIfEmpty("我爱中国", "#")  =  "W"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty("1爱中国", "#")   =  "1"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty("@#国", "#")     =  "G"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty(null, "#")       =  "#"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty("", "#")         =  "#"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty(",", "#")        =  "#"
     * </pre>
     *
     * @param chinese      the text whose initial is wanted, may be {@code null}
     * @param defaultValue the value returned when no initial can be determined; returned as-is,
     *                     so it may be {@code null}
     * @return the upper-case initial, or {@code defaultValue} if the initial is empty
     */
    public static String getChineseInitialDefaultIfEmpty(String chinese, String defaultValue) {
        String initial = getChineseInitial(chinese, true);
        return StringUtils.isEmpty(initial) ? defaultValue : initial;
    }

    /**
     * Returns the initial of {@code chinese} after removing punctuation; shorthand for
     * {@code getChineseInitial(chinese, true)}.
     *
     * <pre>
     *  ChineseUtils.getChineseInitial("我爱中国")   =  "W"
     *  ChineseUtils.getChineseInitial("爱中国")     =  "A"
     *  ChineseUtils.getChineseInitial("1爱中国")    =  "1"
     *  ChineseUtils.getChineseInitial("中国")       =  "Z"
     *  ChineseUtils.getChineseInitial("@#国")       =  "G"
     *  ChineseUtils.getChineseInitial("国%$")       =  "G"
     *  ChineseUtils.getChineseInitial("国")         =  "G"
     *  ChineseUtils.getChineseInitial("W我爱中国")  =  "W"
     *  ChineseUtils.getChineseInitial("I我爱中国")  =  "I"
     *  ChineseUtils.getChineseInitial("null")       =  "N"
     *  ChineseUtils.getChineseInitial(null)         =  ""
     *  ChineseUtils.getChineseInitial("")           =  ""
     *  ChineseUtils.getChineseInitial(",")          =  ""
     * </pre>
     *
     * @param chinese the text whose initial is wanted, may be {@code null}
     * @return the upper-case initial, or {@link StringUtils#EMPTY} if none can be determined
     */
    public static String getChineseInitial(String chinese) {
        return getChineseInitial(chinese, true);
    }

    /**
     * Returns the upper-case initial of the first character of {@code chinese}.
     *
     * <ul>
     *   <li>An ASCII letter or digit is returned as-is (letters upper-cased).</li>
     *   <li>A Chinese character in the range U+4E00..U+9FA5 is converted to pinyin and the first
     *       letter of its first reading is returned, upper-cased; for characters with several
     *       readings only the first one reported by pinyin4j is used.</li>
     *   <li>Anything else, including {@code null}, blank input, or a character for which no
     *       pinyin is available, yields {@link StringUtils#EMPTY}.</li>
     * </ul>
     *
     * <p>Leading whitespace is only skipped when {@code removePunctuation} is {@code true}
     * (because {@link #removePunctuation(String)} trims its result).</p>
     *
     * <pre>
     *  ChineseUtils.getChineseInitial("我爱中国", true)   =  "W"
     *  ChineseUtils.getChineseInitial("1爱中国", true)    =  "1"
     *  ChineseUtils.getChineseInitial("@#国", true)       =  "G"
     *  ChineseUtils.getChineseInitial("@#国", false)      =  ""
     *  ChineseUtils.getChineseInitial(", 国", true)       =  "G"
     *  ChineseUtils.getChineseInitial("i我爱中国", true)  =  "I"
     *  ChineseUtils.getChineseInitial(null, true)         =  ""
     *  ChineseUtils.getChineseInitial("", true)           =  ""
     *  ChineseUtils.getChineseInitial(",", true)          =  ""
     * </pre>
     *
     * @param chinese           the text whose initial is wanted, may be {@code null}
     * @param removePunctuation whether to strip punctuation from {@code chinese} before looking
     *                          at its first character
     * @return the upper-case initial, or {@link StringUtils#EMPTY} if none can be determined
     */
    public static String getChineseInitial(String chinese, boolean removePunctuation) {
        String text = removePunctuation ? removePunctuation(chinese) : chinese;
        if (StringUtils.isBlank(text)) {
            return StringUtils.EMPTY;
        }

        char firstChar = text.charAt(0);

        // ASCII letters and digits are their own initial. Character.toUpperCase is locale-independent,
        // unlike String.toUpperCase(), which maps 'i' to a dotted capital I under a Turkish default locale.
        if (CharUtils.isAsciiAlphanumeric(firstChar)) {
            return String.valueOf(Character.toUpperCase(firstChar));
        }

        // Plain range comparison instead of compiling a regex for a single character.
        if (firstChar < CHINESE_FIRST || firstChar > CHINESE_LAST) {
            return StringUtils.EMPTY;
        }

        try {
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(firstChar);
            // pinyin4j reports "no pinyin known" with a null result; check for it explicitly
            // rather than relying on a NullPointerException being caught below.
            if (readings == null || readings.length == 0 || StringUtils.isEmpty(readings[0])) {
                return StringUtils.EMPTY;
            }
            return String.valueOf(Character.toUpperCase(readings[0].charAt(0)));
        } catch (RuntimeException e) {
            // Best effort: a failure inside the pinyin library must not propagate to callers.
            LoggerUtil.warn("get " + text + " chinese initial fail.", e);
        }

        return StringUtils.EMPTY;
    }

    /**
     * Removes every character of the Unicode punctuation category from {@code str} and then trims
     * leading and trailing whitespace. Trimming happens after the removal so that whitespace
     * exposed by a removed leading or trailing punctuation mark is dropped as well. Whitespace
     * inside the text is kept. Symbols that are not punctuation (for example {@code $} or
     * {@code +}) are kept.
     *
     * <pre>
     *  ChineseUtils.removePunctuation(null)              =  ""
     *  ChineseUtils.removePunctuation("")                =  ""
     *  ChineseUtils.removePunctuation(" ")               =  ""
     *  ChineseUtils.removePunctuation("我爱中国")         =  "我爱中国"
     *  ChineseUtils.removePunctuation("我爱中国!")        =  "我爱中国"
     *  ChineseUtils.removePunctuation("我爱中国。")       =  "我爱中国"
     *  ChineseUtils.removePunctuation("我爱中国.")        =  "我爱中国"
     *  ChineseUtils.removePunctuation("  我爱中国.  ")    =  "我爱中国"
     *  ChineseUtils.removePunctuation(", 我爱中国 !")     =  "我爱中国"
     * </pre>
     *
     * @param str the text to clean, may be {@code null}
     * @return the cleaned text, or {@link StringUtils#EMPTY} if {@code str} is {@code null} or empty
     */
    public static String removePunctuation(String str) {
        if (StringUtils.isEmpty(str)) {
            return StringUtils.EMPTY;
        }

        return PUNCTUATION_PATTERN.matcher(str).replaceAll(StringUtils.EMPTY).trim();
    }

}

三、运行结果

    /**
     * Demo entry point: prints the initial computed by {@code ChineseUtils.getChineseInitial}
     * for a fixed list of sample inputs, one result per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String[] samples = {
            "我爱中国",
            "爱中国",
            "1爱中国",
            "中国",
            "@#国",
            "国%$",
            "国",
            "W我爱中国",
            "I我爱中国",
            "null",
            null,
            "",
            ","
        };
        for (String sample : samples) {
            System.out.println(ChineseUtils.getChineseInitial(sample));
        }
    }
W
A
1
Z
G
G
G
W
I
N
(最后三个输入 null、""、"," 均返回空字符串,因此对应的输出是三个空行。)
 https://pan.baidu.com/s/18R8DGiGAkHdtafYLBlTVvg   提取码: h4jm
两只橙 CSDN认证博客专家 TensorFlow NLP 神经网络
全球AI挑战赛百强选手,曾任职于腾讯微信事业部,魅族flyme事业部,现任中国平安AI研发工程师。《深度学习500问》作译者,CSDN博客专家及签约讲师,指弹吉他爱好者,简书专栏作家。
©️2020 CSDN 皮肤主题: 技术工厂 设计师:CSDN官方博客 返回首页
实付 59.90元
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值