Javaで文字実体参照と数値文字参照をdecodeするver2.
Javaで文字実体参照と数値文字参照をデコード(?)する - ttmmrr(@o_tmr)の日記の修正版です。
package org.java_kuche.tmr;

import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Decodes character entity references (e.g. {@code &amp;}) and numeric
 * character references (e.g. {@code &#12354;} or {@code &#x3042;}) in a string.
 *
 * <p>Only the entity names registered in {@link #characterEntityReferences()}
 * are recognised; any other {@code &name;} sequence is left untouched. The
 * input is scanned once, so the output of one replacement is never decoded
 * again ({@code &amp;lt;} becomes {@code &lt;}, not {@code <}).
 */
public class CharRefDecode {

    /**
     * Numeric character reference. Relative to this fragment, capture group 1
     * holds the hexadecimal digits and capture group 2 the decimal digits.
     */
    private static final String RE_NCR = "&#(?:x([0-9a-f]+)|([0-9]+));";

    /** Capture group of {@link #CHARACTER_REFERENCE} holding a matched entity reference. */
    private static final int GROUP_ENTITY = 1;

    /** Capture group of {@link #CHARACTER_REFERENCE} holding hexadecimal digits. */
    private static final int GROUP_HEX = 2;

    /** Capture group of {@link #CHARACTER_REFERENCE} holding decimal digits. */
    private static final int GROUP_DECIMAL = 3;

    /** Supported entity references (lower-case, including {@code &} and {@code ;}) and their text. */
    private static final Map<String, String> CHARACTER_ENTITY_REFERENCES = characterEntityReferences();

    /** Matches any supported entity reference or any numeric character reference. */
    private static final Pattern CHARACTER_REFERENCE = compile(CHARACTER_ENTITY_REFERENCES);

    /**
     * Builds the table of supported character entity references.
     *
     * @return an unmodifiable map from the full, lower-case reference to its replacement text
     */
    private static Map<String, String> characterEntityReferences() {
        final Map<String, String> map = new HashMap<String, String>();
        map.put("&amp;", "&");
        map.put("&lt;", "<");
        map.put("&gt;", ">");
        map.put("&quot;", "\"");
        map.put("&apos;", "'");
        // NO-BREAK SPACE, written as an escape so it cannot be confused with U+0020.
        map.put("&nbsp;", "\u00A0");
        return Collections.unmodifiableMap(map);
    }

    /**
     * Compiles the pattern {@code (entity1|entity2|...)|<numeric reference>}.
     *
     * @param cer the entity table whose keys become the alternatives of capture group 1
     * @return a case-insensitive pattern with groups {@link #GROUP_ENTITY},
     *         {@link #GROUP_HEX} and {@link #GROUP_DECIMAL}
     */
    private static Pattern compile(final Map<String, String> cer) {
        final StringBuilder sb = new StringBuilder("(");
        String delimiter = "";
        for (final String key : cer.keySet()) {
            // Quote the key so it is always matched literally.
            sb.append(delimiter).append(Pattern.quote(key));
            delimiter = "|";
        }
        sb.append(")|").append(RE_NCR);
        return Pattern.compile(sb.toString(), Pattern.CASE_INSENSITIVE);
    }

    /**
     * Replaces every supported character reference in {@code input} with the
     * text it denotes.
     *
     * @param input the text to decode; may be {@code null} or empty
     * @return the decoded text, or {@code input} itself when it is {@code null} or empty
     */
    public static String decode(final String input) {
        if (null == input || input.isEmpty()) {
            return input;
        }
        final StringBuffer sb = new StringBuffer(input.length());
        final Matcher matcher = CHARACTER_REFERENCE.matcher(input);
        while (matcher.find()) {
            final String entity = matcher.group(GROUP_ENTITY);
            final String hexDigits = matcher.group(GROUP_HEX);
            final String replacement;
            if (null != entity) {
                // The pattern is case-insensitive but the table keys are lower-case;
                // normalise so that e.g. "&AMP;" resolves instead of yielding null.
                replacement = CHARACTER_ENTITY_REFERENCES.get(entity.toLowerCase(Locale.ROOT));
            } else if (null != hexDigits) {
                replacement = charCode(hexDigits, 16);
            } else {
                replacement = charCode(matcher.group(GROUP_DECIMAL), 10);
            }
            // quoteReplacement keeps '\' and '$' in the decoded text from being
            // interpreted as replacement-string syntax.
            matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Converts the digits of a numeric character reference to text.
     *
     * @param str   the digits of the reference, without prefix or terminator
     * @param radix 16 for hexadecimal references, 10 for decimal ones
     * @return the character(s) for the code point (a surrogate pair above U+FFFF),
     *         or {@code "?"} when the value is not a valid Unicode code point
     */
    private static String charCode(final String str, final int radix) {
        final int codePoint;
        try {
            codePoint = Integer.parseInt(str, radix);
        } catch (NumberFormatException e) {
            // The digits are pre-validated by the regex, so this only happens when
            // the value overflows int, which is far beyond the Unicode range.
            return "?";
        }
        if (!Character.isValidCodePoint(codePoint)) {
            return "?";
        }
        return new String(Character.toChars(codePoint));
    }
}
public static void main(String[] args) { String input = "abc " def : xyz & 33"; System.out.println(decode(input)); // 「オープンソースソフトウェア」という文字列を数値文字参照で10進法記述. String testNCRString10 = "オープンソ" + "ースソフトウェア"; System.out.println(decode(testNCRString10)); // 「あいうえお」という文字列を数値文字参照で16進法記述. String testNCRString16 = "あいうえお"; System.out.println(decode(testNCRString16)); System.out.println(decode("\")); System.out.println(decode("$")); }
バグや改善点などのコメントを募集中です。