/*
 * Repository-viewer header captured together with this file (not part of the Java source):
 *   OsmAnd/OsmAnd-java/src/main/java/net/osmand/util/TransliterationHelper.java
 *   249 lines, 6.6 KiB, Java
 *   2019-02-18 13:02:38 +01:00
 */
package net.osmand.util;
import com.atilika.kuromoji.ipadic.Token;
import com.atilika.kuromoji.ipadic.Tokenizer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import net.osmand.PlatformUtil;
import net.sf.junidecode.Junidecode;
import org.apache.commons.logging.Log;
/**
 * Converts text to Latin script.
 *
 * <p>The conversion applied by {@link #transliterateText(String)} is selected by the static
 * "active map language": {@link #DEFAULT} delegates to {@code Junidecode.unidecode}, while
 * {@link #JAPANESE} tokenizes the text with kuromoji and romanizes the katakana reading of
 * each token.
 *
 * <p>NOTE(review): the single-character Japanese literals in this class (the single-kana map
 * keys, the {@code "数"} sub-category and the sokuon) are reconstructions; confirm them
 * against the kuromoji IPADIC feature values and the project's reference copy of this file.
 */
public class TransliterationHelper {

    public final static Log LOG = PlatformUtil.getLog(TransliterationHelper.class);

    /** Language code selecting the generic Junidecode transliteration. */
    public final static int DEFAULT = 1000;
    /** Language code selecting the kuromoji based Japanese romanization. */
    public final static int JAPANESE = 1001;

    /** Index of the token feature that is romanized as the token's katakana reading. */
    private static final int KANA_READING_INDEX = 8;
    /** Placeholder found in a feature slot when the dictionary has no value for it. */
    private static final String MISSING_FEATURE = "*";
    /** Small tsu: doubles the consonant of the kana that follows it. */
    private static final char SOKUON = 'ッ';

    /** Katakana (one or two characters) to romaji lookup table; never modified after creation. */
    private static final Map<String, String> KANA_MAP = buildKanaMap();

    private static final TransliterationHelper INSTANCE = new TransliterationHelper();

    private static int activeMapLanguage = DEFAULT;

    /**
     * Holds the shared tokenizer. It is created on first access of this holder, so callers
     * that only use {@link #DEFAULT} never construct it, and Japanese callers construct it
     * once instead of once per transliterated string.
     *
     * <p>NOTE(review): this assumes a kuromoji {@code Tokenizer} may be reused across calls
     * and threads; confirm for the bundled kuromoji version.
     */
    private static final class TokenizerHolder {
        static final Tokenizer TOKENIZER = new Tokenizer();

        private TokenizerHolder() {
        }
    }

    private TransliterationHelper() {
    }

    /**
     * @return the single shared helper instance
     */
    public static TransliterationHelper getInstance() {
        return INSTANCE;
    }

    /**
     * Selects the conversion used by {@link #transliterateText(String)}.
     *
     * @param activeMapLanguage {@link #DEFAULT} or {@link #JAPANESE}; with any other value
     *                          {@link #transliterateText(String)} returns its input unchanged
     */
    public static void setActiveMapLanguage(int activeMapLanguage) {
        TransliterationHelper.activeMapLanguage = activeMapLanguage;
    }

    /**
     * @return the currently selected language code
     */
    public static int getActiveMapLanguage() {
        return activeMapLanguage;
    }

    /**
     * Transliterates {@code text} according to the active map language.
     *
     * @param text the text to convert; {@code null} and empty input are returned as is
     * @return the transliterated text, or {@code text} itself when the active map language
     *         is neither {@link #DEFAULT} nor {@link #JAPANESE}
     */
    public String transliterateText(String text) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        switch (activeMapLanguage) {
            case DEFAULT:
                return Junidecode.unidecode(text);
            case JAPANESE:
                return japanese2Romaji(text);
            default:
                return text;
        }
    }

    /**
     * Romanizes Japanese text token by token.
     *
     * <p>Symbols, numbers, alphabetic tokens and "サ変接続" nouns are copied verbatim with no
     * separator. Tokens without a reading are copied verbatim followed by a space. Every
     * other token is replaced by the capitalized romaji of its katakana reading followed by
     * a space (so the result may end with a space). A reading that ends in a sokuon is held
     * back and joined with the next reading so the sokuon can double the following consonant.
     *
     * @param input non-null Japanese text
     * @return the romanized text
     */
    private String japanese2Romaji(String input) {
        List<Token> tokens = TokenizerHolder.TOKENIZER.tokenize(input);
        StringBuilder out = new StringBuilder();
        // Reading of the preceding token(s) that ended in a sokuon and still awaits its successor.
        String pendingReading = "";
        for (Token token : tokens) {
            String[] features = token.getAllFeaturesArray();
            String reading = features.length > KANA_READING_INDEX
                    ? features[KANA_READING_INDEX]
                    : MISSING_FEATURE;
            boolean copiedWithoutSeparator = isCopiedWithoutSeparator(features);
            if (copiedWithoutSeparator || MISSING_FEATURE.equals(reading)) {
                if (!pendingReading.isEmpty()) {
                    // Nothing left to merge with: emit the held-back word before this token.
                    appendWord(out, pendingReading);
                    pendingReading = "";
                }
                out.append(token.getSurface());
                if (!copiedWithoutSeparator) {
                    out.append(' ');
                }
                continue;
            }
            String merged = pendingReading + reading;
            if (endsWithSokuon(merged)) {
                pendingReading = merged;
                continue;
            }
            pendingReading = "";
            appendWord(out, merged);
        }
        if (!pendingReading.isEmpty()) {
            // The text ended on a sokuon; emit the word rather than dropping it.
            appendWord(out, pendingReading);
        }
        return out.toString();
    }

    /**
     * Tells whether a token is copied to the output as written, without a trailing space.
     *
     * @param features the token's feature array; index 0 is compared with "記号" (symbol) and
     *                 index 1 with "数" (number), "アルファベット" (alphabet) and "サ変接続"
     * @return {@code true} when one of those values matches
     */
    private static boolean isCopiedWithoutSeparator(String[] features) {
        if (features.length > 0 && "記号".equals(features[0])) {
            return true;
        }
        if (features.length < 2) {
            return false;
        }
        switch (features[1]) {
            case "数":
            case "アルファベット":
            case "サ変接続":
                return true;
            default:
                return false;
        }
    }

    /** Returns whether {@code kana} is non-empty and its last character is the sokuon. */
    private static boolean endsWithSokuon(String kana) {
        return !kana.isEmpty() && kana.charAt(kana.length() - 1) == SOKUON;
    }

    /**
     * Appends the romaji of {@code kanaReading} with its first letter upper-cased, followed
     * by a space. {@code Character.toUpperCase} is used so the result does not depend on the
     * default locale.
     */
    private void appendWord(StringBuilder out, String kanaReading) {
        String romaji = convertKanaToRomaji(kanaReading);
        if (!romaji.isEmpty()) {
            out.append(Character.toUpperCase(romaji.charAt(0)));
            out.append(romaji, 1, romaji.length());
        }
        out.append(' ');
    }

    /**
     * Converts a katakana string to romaji using the lookup table.
     *
     * <p>At each position a two-character entry is preferred over a one-character entry. A
     * sokuon is replaced by the first romaji letter of the kana that follows it. Characters
     * that cannot be converted (including a sokuon with no convertible successor) are copied
     * unchanged.
     *
     * @param s non-null katakana text
     * @return the romaji text
     */
    private String convertKanaToRomaji(String s) {
        StringBuilder romaji = new StringBuilder();
        int length = s.length();
        int i = 0;
        while (i < length) {
            if (i + 2 <= length) {
                String digraph = KANA_MAP.get(s.substring(i, i + 2));
                if (digraph != null) {
                    romaji.append(digraph);
                    i += 2;
                    continue;
                }
            }
            char kana = s.charAt(i);
            String single = KANA_MAP.get(String.valueOf(kana));
            if (single != null) {
                romaji.append(single);
            } else if (kana == SOKUON && i + 1 < length) {
                String next = KANA_MAP.get(s.substring(i + 1, i + 2));
                if (next != null) {
                    romaji.append(next.charAt(0));
                } else {
                    // No romaji for the successor: keep the sokuon instead of failing.
                    romaji.append(kana);
                }
            } else {
                romaji.append(kana);
            }
            i++;
        }
        return romaji.toString();
    }

    /** Builds the katakana to romaji table: single kana, two-character digraphs, long-vowel mark. */
    private static Map<String, String> buildKanaMap() {
        String[][] table = {
                // Plain syllables
                {"ア", "a"}, {"イ", "i"}, {"ウ", "u"}, {"エ", "e"}, {"オ", "o"},
                {"カ", "ka"}, {"キ", "ki"}, {"ク", "ku"}, {"ケ", "ke"}, {"コ", "ko"},
                {"サ", "sa"}, {"シ", "shi"}, {"ス", "su"}, {"セ", "se"}, {"ソ", "so"},
                {"タ", "ta"}, {"チ", "chi"}, {"ツ", "tsu"}, {"テ", "te"}, {"ト", "to"},
                {"ナ", "na"}, {"ニ", "ni"}, {"ヌ", "nu"}, {"ネ", "ne"}, {"ノ", "no"},
                {"ハ", "ha"}, {"ヒ", "hi"}, {"フ", "fu"}, {"ヘ", "he"}, {"ホ", "ho"},
                {"マ", "ma"}, {"ミ", "mi"}, {"ム", "mu"}, {"メ", "me"}, {"モ", "mo"},
                {"ヤ", "ya"}, {"ユ", "yu"}, {"ヨ", "yo"},
                {"ラ", "ra"}, {"リ", "ri"}, {"ル", "ru"}, {"レ", "re"}, {"ロ", "ro"},
                {"ワ", "wa"}, {"ヲ", "wo"}, {"ン", "n"},
                // Voiced and semi-voiced syllables
                {"ガ", "ga"}, {"ギ", "gi"}, {"グ", "gu"}, {"ゲ", "ge"}, {"ゴ", "go"},
                {"ザ", "za"}, {"ジ", "ji"}, {"ズ", "zu"}, {"ゼ", "ze"}, {"ゾ", "zo"},
                {"ダ", "da"}, {"ヂ", "ji"}, {"ヅ", "zu"}, {"デ", "de"}, {"ド", "do"},
                {"バ", "ba"}, {"ビ", "bi"}, {"ブ", "bu"}, {"ベ", "be"}, {"ボ", "bo"},
                {"パ", "pa"}, {"ピ", "pi"}, {"プ", "pu"}, {"ペ", "pe"}, {"ポ", "po"},
                // Two-character combinations
                {"キャ", "kya"}, {"キュ", "kyu"}, {"キョ", "kyo"},
                {"シャ", "sha"}, {"シュ", "shu"}, {"ショ", "sho"},
                {"チャ", "cha"}, {"チュ", "chu"}, {"チョ", "cho"},
                {"ニャ", "nya"}, {"ニュ", "nyu"}, {"ニョ", "nyo"},
                {"ヒャ", "hya"}, {"ヒュ", "hyu"}, {"ヒョ", "hyo"},
                {"ミャ", "mya"}, {"ミュ", "myu"}, {"ミョ", "myo"},
                {"リャ", "rya"}, {"リュ", "ryu"}, {"リョ", "ryo"},
                {"ギャ", "gya"}, {"ギュ", "gyu"}, {"ギョ", "gyo"},
                {"ジャ", "ja"}, {"ジュ", "ju"}, {"ジョ", "jo"},
                {"ティ", "ti"}, {"ディ", "di"}, {"ツィ", "tsi"},
                {"ヂャ", "dya"}, {"ヂュ", "dyu"}, {"ヂョ", "dyo"},
                {"ビャ", "bya"}, {"ビュ", "byu"}, {"ビョ", "byo"},
                {"ピャ", "pya"}, {"ピュ", "pyu"}, {"ピョ", "pyo"},
                // Long-vowel mark
                {"ー", "-"},
        };
        Map<String, String> map = new HashMap<>();
        for (String[] entry : table) {
            map.put(entry[0], entry[1]);
        }
        return map;
    }
}