OsmAnd/OsmAnd-java/src/main/java/net/osmand/util/TransliterationHelper.java
2019-02-20 10:15:43 +02:00

254 lines
7 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package net.osmand.util;
import com.atilika.kuromoji.ipadic.Token;
import com.atilika.kuromoji.ipadic.Tokenizer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import net.osmand.PlatformUtil;
import net.sf.junidecode.Junidecode;
import org.apache.commons.logging.Log;
public class TransliterationHelper {
private static TransliterationHelper instance;
public final static Log LOG = PlatformUtil.getLog(TransliterationHelper.class);
public final static String DEFAULT = "default";
public final static String JAPAN = "Japan";
private static String country = DEFAULT;
private static Tokenizer tokenizer;
private static Map<String, String> katakanaMap = new HashMap<>();
private TransliterationHelper() {
}
static {
try {
instance = new TransliterationHelper();
} catch (Exception e) {
LOG.debug(e.getMessage(), e);
}
}
public static TransliterationHelper getInstance() {
return instance;
}
public static void setCountry(String countryName) {
if (!countryName.equals(country)) {
switch (countryName) {
case "Japan": {
country = JAPAN;
break;
}
default:
country = DEFAULT;
break;
}
}
}
public static String getCountry() {
return country;
}
public static String transliterate(String text) {
if (tokenizer == null) {
tokenizer = new Tokenizer();
}
switch (country) {
case DEFAULT:
return Junidecode.unidecode(text);
case JAPAN:
return japanese2Romaji(text);
}
return text;
}
private static String japanese2Romaji(String text) {
boolean capitalizeWords = true;
List<Token> tokens = tokenizer.tokenize(text);
StringBuilder builder = new StringBuilder();
if (katakanaMap.isEmpty()) {
initKanaMap();
}
for (Token token : tokens) {
String type = token.getAllFeaturesArray()[1];
if (token.getAllFeaturesArray()[0].equals("記号")) {
builder.append(token.getSurface());
continue;
}
switch (token.getAllFeaturesArray()[1]) {
case "":
case "アルファベット":
case "サ変接続":
builder.append(token.getSurface());
continue;
default:
String lastFeature = token.getAllFeaturesArray()[8];
if (lastFeature.equals("*")) {
builder.append(token.getSurface());
} else {
String romaji = convertKanaToRomaji(token.getAllFeaturesArray()[8]);
if (capitalizeWords) {
builder.append(romaji.substring(0, 1).toUpperCase());
builder.append(romaji.substring(1));
} else {
if (token.getSurface()
.equals(token.getPronunciation())) {
romaji = romaji.toUpperCase();
}
builder.append(romaji);
}
}
}
builder.append(" ");
}
return builder.toString();
}
private static String convertKanaToRomaji(String s) {
StringBuilder t = new StringBuilder();
for (int i = 0; i < s.length(); i++) {
if (i <= s.length() - 2) {
if (katakanaMap.containsKey(s.substring(i, i + 2))) {
t.append(katakanaMap.get(s.substring(i, i + 2)));
i++;
} else if (katakanaMap.containsKey(s.substring(i, i + 1))) {
t.append(katakanaMap.get(s.substring(i, i + 1)));
} else if (s.charAt(i) == 'ッ') {
t.append(katakanaMap.get(s.substring(i + 1, i + 2)).charAt(0));
} else {
t.append(s.charAt(i));
}
} else {
if (katakanaMap.containsKey(s.substring(i, i + 1))) {
t.append(katakanaMap.get(s.substring(i, i + 1)));
} else {
t.append(s.charAt(i));
}
}
}
return t.toString();
}
private static void initKanaMap() {
katakanaMap.put("", "a");
katakanaMap.put("", "i");
katakanaMap.put("", "u");
katakanaMap.put("", "e");
katakanaMap.put("", "o");
katakanaMap.put("", "ka");
katakanaMap.put("", "ki");
katakanaMap.put("", "ku");
katakanaMap.put("", "ke");
katakanaMap.put("", "ko");
katakanaMap.put("", "sa");
katakanaMap.put("", "shi");
katakanaMap.put("", "su");
katakanaMap.put("", "se");
katakanaMap.put("", "so");
katakanaMap.put("", "ta");
katakanaMap.put("", "chi");
katakanaMap.put("", "tsu");
katakanaMap.put("", "te");
katakanaMap.put("", "to");
katakanaMap.put("", "na");
katakanaMap.put("", "ni");
katakanaMap.put("", "nu");
katakanaMap.put("", "ne");
katakanaMap.put("", "no");
katakanaMap.put("", "ha");
katakanaMap.put("", "hi");
katakanaMap.put("", "fu");
katakanaMap.put("", "he");
katakanaMap.put("", "ho");
katakanaMap.put("", "ma");
katakanaMap.put("", "mi");
katakanaMap.put("", "mu");
katakanaMap.put("", "me");
katakanaMap.put("", "mo");
katakanaMap.put("", "ya");
katakanaMap.put("", "yu");
katakanaMap.put("", "yo");
katakanaMap.put("", "ra");
katakanaMap.put("", "ri");
katakanaMap.put("", "ru");
katakanaMap.put("", "re");
katakanaMap.put("", "ro");
katakanaMap.put("", "wa");
katakanaMap.put("", "wo");
katakanaMap.put("", "n");
katakanaMap.put("", "ga");
katakanaMap.put("", "gi");
katakanaMap.put("", "gu");
katakanaMap.put("", "ge");
katakanaMap.put("", "go");
katakanaMap.put("", "za");
katakanaMap.put("", "ji");
katakanaMap.put("", "zu");
katakanaMap.put("", "ze");
katakanaMap.put("", "zo");
katakanaMap.put("", "da");
katakanaMap.put("", "ji");
katakanaMap.put("", "zu");
katakanaMap.put("", "de");
katakanaMap.put("", "do");
katakanaMap.put("", "ba");
katakanaMap.put("", "bi");
katakanaMap.put("", "bu");
katakanaMap.put("", "be");
katakanaMap.put("", "bo");
katakanaMap.put("", "pa");
katakanaMap.put("", "pi");
katakanaMap.put("", "pu");
katakanaMap.put("", "pe");
katakanaMap.put("", "po");
katakanaMap.put("キャ", "kya");
katakanaMap.put("キュ", "kyu");
katakanaMap.put("キョ", "kyo");
katakanaMap.put("シャ", "sha");
katakanaMap.put("シュ", "shu");
katakanaMap.put("ショ", "sho");
katakanaMap.put("チャ", "cha");
katakanaMap.put("チュ", "chu");
katakanaMap.put("チョ", "cho");
katakanaMap.put("ニャ", "nya");
katakanaMap.put("ニュ", "nyu");
katakanaMap.put("ニョ", "nyo");
katakanaMap.put("ヒャ", "hya");
katakanaMap.put("ヒュ", "hyu");
katakanaMap.put("ヒョ", "hyo");
katakanaMap.put("リャ", "rya");
katakanaMap.put("リュ", "ryu");
katakanaMap.put("リョ", "ryo");
katakanaMap.put("ギャ", "gya");
katakanaMap.put("ギュ", "gyu");
katakanaMap.put("ギョ", "gyo");
katakanaMap.put("ジャ", "ja");
katakanaMap.put("ジュ", "ju");
katakanaMap.put("ジョ", "jo");
katakanaMap.put("ティ", "ti");
katakanaMap.put("ディ", "di");
katakanaMap.put("ツィ", "tsi");
katakanaMap.put("ヂャ", "dya");
katakanaMap.put("ヂュ", "dyu");
katakanaMap.put("ヂョ", "dyo");
katakanaMap.put("ビャ", "bya");
katakanaMap.put("ビュ", "byu");
katakanaMap.put("ビョ", "byo");
katakanaMap.put("ピャ", "pya");
katakanaMap.put("ピュ", "pyu");
katakanaMap.put("ピョ", "pyo");
katakanaMap.put("", "-");
}
}