Javaで文字実体参照と数値文字参照をdecodeするver2.

Javaで文字実体参照と数値文字参照をデコード(?)する - ttmmrr(@o_tmr)の日記の修正版です。

package org.java_kuche.tmr;

import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Decodes a small set of character entity references (such as {@code &amp;})
 * and numeric character references (such as {@code &#34;} or {@code &#x3a;})
 * found in a string.
 *
 * <p>Matching is case-insensitive, so {@code &AMP;} and {@code &#X3A;} are
 * decoded the same way as their lower-case forms. Text that does not match a
 * known reference is copied to the output unchanged.
 */
public class CharRefDecode {

  /**
   * Numeric character reference. Capture group 1 holds the hexadecimal digits
   * (after the {@code x} marker), capture group 2 holds the decimal digits;
   * exactly one of them is non-null on a match.
   */
  private static final String RE_NCR = "&#(?:x([0-9a-f]+)|([0-9]+));";

  /** Group index of a matched entity reference in {@link #CHARACTER_REFERENCE}. */
  private static final int GROUP_ENTITY = 1;

  /** Group index of the hexadecimal digits in {@link #CHARACTER_REFERENCE}. */
  private static final int GROUP_HEX = 2;

  /** Group index of the decimal digits in {@link #CHARACTER_REFERENCE}. */
  private static final int GROUP_DECIMAL = 3;

  /** Lower-case entity reference (including {@code &} and {@code ;}) to its replacement text. */
  private static final Map<String,String> CHARACTER_ENTITY_REFERENCES = characterEntityReferences();

  /** Compiled once; matches either a known entity reference or a numeric character reference. */
  private static final Pattern CHARACTER_REFERENCE = compile(CHARACTER_ENTITY_REFERENCES);

  /**
   * Builds the table of supported entity references.
   *
   * @return an unmodifiable map keyed by the lower-case reference text
   */
  private static Map<String,String> characterEntityReferences() {
    final Map<String,String> map = new HashMap<String, String>();
    map.put("&amp;", "&");
    map.put("&lt;", "<");
    map.put("&gt;", ">");
    map.put("&quot;", "\"");
    map.put("&apos;", "'");
    // NOTE(review): &nbsp; conventionally denotes U+00A0 (no-break space);
    // confirm which space character this literal is meant to contain.
    map.put("&nbsp;", " ");
    return Collections.unmodifiableMap(map);
  }

  /**
   * Builds the combined pattern {@code (entity1|entity2|...)|<numeric reference>}.
   *
   * @param cer the entity table whose keys become the alternatives of group 1
   * @return a case-insensitive pattern with the groups described by
   *         {@link #GROUP_ENTITY}, {@link #GROUP_HEX} and {@link #GROUP_DECIMAL}
   */
  private static Pattern compile(final Map<String,String> cer) {
    final StringBuilder sb = new StringBuilder("(");
    String delimiter = "";
    for (final String reference : cer.keySet()) {
      // Quote each key so it is always matched literally.
      sb.append(delimiter).append(Pattern.quote(reference));
      delimiter = "|";
    }
    sb.append(")|").append(RE_NCR);
    return Pattern.compile(sb.toString(), Pattern.CASE_INSENSITIVE);
  }

  /**
   * Replaces every supported entity reference and numeric character reference
   * in {@code input} with the character(s) it denotes.
   *
   * @param input the text to decode; may be {@code null} or empty
   * @return the decoded text, or {@code input} itself when it is {@code null}
   *         or empty
   */
  public static String decode(final String input) {
    if (null == input || 0 == input.length()) {
      return input;
    }
    // StringBuffer (not StringBuilder) because Matcher.appendReplacement
    // only accepts StringBuffer on older Java versions.
    final StringBuffer sb = new StringBuffer(input.length());
    final Matcher matcher = CHARACTER_REFERENCE.matcher(input);
    while (matcher.find()) {
      final String entity = matcher.group(GROUP_ENTITY);
      final String hex = matcher.group(GROUP_HEX);
      final String replacement;
      if (null != entity) {
        // The pattern is case-insensitive but the table keys are lower-case,
        // so normalise before the lookup; otherwise "&AMP;" would yield null.
        replacement = CHARACTER_ENTITY_REFERENCES.get(entity.toLowerCase(Locale.ROOT));
      } else if (null != hex) {
        replacement = charCode(hex, 16);
      } else {
        replacement = charCode(matcher.group(GROUP_DECIMAL), 10);
      }
      // quoteReplacement keeps '\' and '$' in the decoded text literal.
      matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
    }
    matcher.appendTail(sb);
    return sb.toString();
  }

  /**
   * Converts the digits of a numeric character reference to text.
   *
   * @param str the digits of the reference, without prefix or terminator
   * @param radix 16 for hexadecimal digits, 10 for decimal digits
   * @return the character(s) for the code point (a surrogate pair for code
   *         points above U+FFFF), or {@code "?"} when the value is not a
   *         valid Unicode code point or does not fit in an {@code int}
   */
  private static String charCode(final String str, final int radix) {
    try {
      final int codePoint = Integer.parseInt(str, radix);
      if (Character.isValidCodePoint(codePoint)) {
        return new String(Character.toChars(codePoint));
      }
    } catch (final NumberFormatException ignored) {
      // Too many digits to fit in an int: cannot be a valid code point,
      // so fall through to the placeholder instead of failing the decode.
    }
    return "?";
  }
}
  /**
   * Demo entry point: decodes a handful of sample strings and prints each
   * result on its own line.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args) {
    // Decimal numeric character references spelling the katakana phrase
    // for "open source software".
    final String decimalSample = "&#12458;&#12540;&#12503;&#12531;&#12477;" +
        "&#12540;&#12473;&#12477;&#12501;&#12488;&#12454;&#12455;&#12450;";
    final String[] samples = {
        // Mix of decimal, hexadecimal and entity references.
        "abc &#34; def &#x3a; xyz &amp; 33",
        decimalSample,
        // Hexadecimal references for the hiragana a-i-u-e-o; the last one
        // uses an upper-case 'X' marker.
        "&#x3042;&#x3044;&#x3046;&#x3048;&#X304A;",
        // Backslash and dollar sign: characters that are special in a
        // regex replacement string.
        "&#92;",
        "&#x24;",
    };
    for (final String sample : samples) {
      System.out.println(decode(sample));
    }
  }

バグや改善点などのコメントを募集中です。