Chuyển Html Sang XML | Võ Văn Hải's Blog

Sau đây là một lớp Java với các phương thức để chuyển nội dung HTML (từ URL, file hoặc chuỗi văn bản) sang XML.

Bài viết tham khảo mã nguồn từ trang http://sourceforge.net/projects/light-html2xml, sau đó bổ sung thêm một số phương thức tiện ích (utils).

Chúc các bạn thành công!

package vovanhai.wordpress.com;

import java.io.BufferedReader;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.InputStream;

import java.io.PrintWriter;

import java.net.URL;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Scanner;

import java.util.Stack;

/**
 * Tolerant, single-pass HTML to XML converter.
 *
 * <p>The input is scanned character by character with a small state machine
 * ({@link states}). Tags are lower-cased, void elements are self-closed, a few
 * elements with optional end tags are closed automatically, everything is
 * wrapped in a single {@code <html>} root, named entities are rewritten as
 * numeric character references and the content of {@code <script>} is wrapped
 * in a CDATA section. Comments, DOCTYPE declarations and processing
 * instructions are dropped. Conversion stops once the root element is closed.
 *
 * <p>NOTE(review): the XML declaration announces {@code iso-8859-1} (or
 * {@code utf-8} when the text starts with the three chars 0xEF 0xBB 0xBF),
 * while the file based helpers read and write with the platform default
 * charset. Non-ASCII content is therefore only labelled correctly when those
 * happen to agree; confirm the intended encoding before relying on it.
 */
public class Html2Xml {

    /** States of the scanner used by {@link #Convert2XML(String)}. */
    enum states {
        text, tag, endtag, attrtext, script, endscript, specialtag, comment, skipcdata,
        entity, namedentity, numericentity, hexaentity, tillgt, tillquote, tillinst, andgt
    }

    /** Marker that opens a CDATA section. */
    private static final String CDATA_OPEN = "<![CDATA[";

    /** Longest entity name in {@link #ENTITY_TABLE} ("thetasym"). */
    private static final int MAX_ENTITY_NAME = 8;

    /**
     * Named character references as whitespace separated {@code name=codepoint}
     * pairs. {@code up2} and {@code exists} are kept as aliases of the correct
     * names {@code sup2} and {@code exist} for backward compatibility.
     */
    private static final String ENTITY_TABLE =
          "AElig=198 Aacute=193 Acirc=194 Agrave=192 Alpha=913 Aring=197 Atilde=195 Auml=196 "
        + "Beta=914 Ccedil=199 Chi=935 Dagger=8225 Delta=916 ETH=208 Eacute=201 Ecirc=202 "
        + "Egrave=200 Epsilon=917 Eta=919 Euml=203 Gamma=915 Iacute=205 Icirc=206 Igrave=204 "
        + "Iota=921 Iuml=207 Kappa=922 Lambda=923 Mu=924 Ntilde=209 Nu=925 OElig=338 "
        + "Oacute=211 Ocirc=212 Ograve=210 Omega=937 Omicron=927 Oslash=216 Otilde=213 Ouml=214 "
        + "Phi=934 Pi=928 Prime=8243 Psi=936 Rho=929 Scaron=352 Sigma=931 THORN=222 "
        + "Tau=932 Theta=920 Uacute=218 Ucirc=219 Ugrave=217 Upsilon=933 Uuml=220 Xi=926 "
        + "Yacute=221 Yuml=376 Zeta=918 "
        + "aacute=225 acirc=226 acute=180 aelig=230 agrave=224 alpha=945 and=8743 ang=8736 "
        + "aring=229 asymp=8776 atilde=227 auml=228 bdquo=8222 beta=946 brvbar=166 bull=8226 "
        + "cap=8745 ccedil=231 cedil=184 cent=162 chi=967 circ=710 clubs=9827 cong=8773 "
        + "copy=169 crarr=8629 cup=8746 curren=164 dagger=8224 darr=8595 deg=176 delta=948 "
        + "diams=9830 divide=247 eacute=233 ecirc=234 egrave=232 empty=8709 emsp=8195 ensp=8194 "
        + "epsilon=949 equiv=8801 eta=951 eth=240 euml=235 euro=8364 exist=8707 exists=8707 "
        + "fnof=402 forall=8704 frac12=189 frac14=188 frac34=190 gamma=947 ge=8805 harr=8596 "
        + "hearts=9829 hellip=8230 iacute=237 icirc=238 iexcl=161 igrave=236 infin=8734 int=8747 "
        + "iota=953 iquest=191 isin=8712 iuml=239 kappa=954 lambda=955 laquo=171 larr=8592 "
        + "lceil=8968 ldquo=8220 le=8804 lfloor=8970 lowast=8727 loz=9674 lrm=8206 lsaquo=8249 "
        + "lsquo=8216 macr=175 mdash=8212 micro=181 middot=183 minus=8722 mu=956 nabla=8711 "
        + "nbsp=160 ndash=8211 ne=8800 ni=8715 not=172 notin=8713 nsub=8836 ntilde=241 "
        + "nu=957 oacute=243 ocirc=244 oelig=339 ograve=242 oline=8254 omega=969 omicron=959 "
        + "oplus=8853 or=8744 ordf=170 ordm=186 oslash=248 otilde=245 otimes=8855 ouml=246 "
        + "para=182 part=8706 permil=8240 perp=8869 phi=966 pi=960 piv=982 plusmn=177 "
        + "pound=163 prime=8242 prod=8719 prop=8733 psi=968 radic=8730 raquo=187 rarr=8594 "
        + "rceil=8969 rdquo=8221 reg=174 rfloor=8971 rho=961 rlm=8207 rsaquo=8250 rsquo=8217 "
        + "sbquo=8218 scaron=353 sdot=8901 sect=167 shy=173 sigma=963 sigmaf=962 sim=8764 "
        + "spades=9824 sub=8834 sube=8838 sum=8721 sup=8835 sup1=185 sup2=178 sup3=179 "
        + "supe=8839 szlig=223 tau=964 there4=8756 theta=952 thetasym=977 thinsp=8201 thorn=254 "
        + "tilde=732 times=215 trade=8482 uacute=250 uarr=8593 ucirc=251 ugrave=249 uml=168 "
        + "up2=178 upsih=978 upsilon=965 uuml=252 xi=958 yacute=253 yen=165 yuml=255 "
        + "zeta=950 zwj=8205 zwnj=8204";

    /** Entity name to code point, filled once from {@link #ENTITY_TABLE}. */
    private static final HashMap<String, Integer> namedentities = new HashMap<String, Integer>();

    /** Elements that never have content; always emitted as {@code <x/>}. */
    private static final List<String> emptytags = new ArrayList<String>();

    /** For an opening tag (key): the open elements it implicitly closes when one of them is on top of the stack. */
    private static final HashMap<String, List<String>> autoclosetags = new HashMap<String, List<String>>();

    // The tables are built exactly once; the previous code refilled them on every conversion.
    static {
        String[] pairs = ENTITY_TABLE.trim().split("\\s+");
        for (int k = 0; k < pairs.length; k++) {
            int eq = pairs[k].indexOf('=');
            namedentities.put(pairs[k].substring(0, eq), Integer.valueOf(pairs[k].substring(eq + 1)));
        }

        String[] empties = {"area", "base", "basefont", "br", "col", "frame", "hr",
                            "img", "input", "isindex", "link", "meta", "param"};
        for (int k = 0; k < empties.length; k++) {
            emptytags.add(empties[k]);
        }

        autoClose("basefont", "basefont");
        autoClose("colgroup", "colgroup");
        autoClose("dd", "dd"); // was "colgroup", a copy-paste slip
        autoClose("dt", "dt");
        autoClose("li", "li");
        autoClose("p", "p");
        autoClose("thead", "tbody", "tfoot");
        autoClose("tbody", "thead", "tfoot");
        autoClose("tfoot", "thead", "tbody");
        autoClose("th", "td");
        autoClose("td", "th", "td");
        autoClose("tr", "tr");
    }

    /** Registers that opening {@code tag} closes any of {@code closes} found on top of the stack. */
    private static void autoClose(String tag, String... closes) {
        List<String> list = new ArrayList<String>();
        for (int k = 0; k < closes.length; k++) {
            list.add(closes[k]);
        }
        autoclosetags.put(tag, list);
    }

    /** Locale independent lower-casing, so tag names do not depend on the default locale. */
    private static String lower(String text) {
        return text.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Appends one character to a tag or attribute name, replacing anything that
     * is not a letter, digit or '-' by a single '_' (a name never starts with a
     * digit or '-'). Used for start tags, attributes and end tags alike so that
     * both ends of an element are normalised identically.
     */
    private static String appendNameChar(String name, char c) {
        if (Character.isLetter(c) || (Character.isLetterOrDigit(c) && name.length() > 0)) {
            return name + c;
        }
        if (name.length() == 0) {
            return "_";
        }
        if (c == '-') {
            return name + "-";
        }
        return name.equals("_") ? name : name + "_";
    }

    /**
     * Builds the serialized form of one attribute and records its name.
     *
     * @param seen     names already emitted for the current tag (updated)
     * @param attrname attribute name; empty names, duplicates and {@code xmlns} yield ""
     * @param value    already escaped value, or {@code null} for a boolean
     *                 attribute (the name is then used as the value)
     * @return {@code " name=\"value\""} or the empty string when the attribute is dropped
     */
    private static String attribute(List<String> seen, String attrname, String value) {
        if (attrname.length() == 0) {
            return "";
        }
        if (attrname.equals("_")) {
            // several unnamed attributes become "_", "__", ...
            while (seen.contains(attrname)) {
                attrname += "_";
            }
        }
        if (seen.contains(attrname) || attrname.equals("xmlns")) {
            return "";
        }
        seen.add(attrname);
        return " " + attrname + "=\"" + (value == null ? attrname : value) + "\"";
    }

    /** Escapes a literal character for use inside a double-quoted XML attribute value. */
    private static String escapeAttrChar(char c) {
        switch (c) {
        case '"':
            return "&quot;";
        case '\'':
            return "&apos;";
        case '<':
            return "&lt;";
        default:
            return String.valueOf(c);
        }
    }

    /** Numeric character reference for {@code code}; code points not allowed in XML 1.0 become U+FFFD. */
    private static String charRef(int code) {
        boolean valid = code == 0x9 || code == 0xA || code == 0xD
                || (code >= 0x20 && code <= 0xD7FF)
                || (code >= 0xE000 && code <= 0xFFFD)
                || (code >= 0x10000 && code <= 0x10FFFF);
        return "&#" + (valid ? code : 0xFFFD) + ";";
    }

    /**
     * Resolves the name found between '&' and ';'. The five XML entities are
     * kept (lower-cased), known HTML names become numeric references (exact case
     * first, then lower-cased) and unknown names are emitted as literal text.
     */
    private static String resolveNamedEntity(String name) {
        String lowered = lower(name);
        if (lowered.equals("amp") || lowered.equals("lt") || lowered.equals("gt")
                || lowered.equals("quot") || lowered.equals("apos")) {
            return "&" + lowered + ";";
        }
        Integer code = namedentities.get(name);
        if (code == null) {
            code = namedentities.get(lowered);
        }
        return code != null ? "&#" + code + ";" : "&amp;" + name + ";";
    }

    /**
     * Emits a start tag (or self-closed tag) and maintains the open element stack:
     * opens the implicit {@code <html>} root, applies the auto-close table and
     * inserts {@code <tbody>} for a {@code <tr>} placed directly inside a table.
     *
     * @param tagname     lower-cased element name; nothing is emitted when empty
     * @param attrs       serialized attributes, each with a leading space
     * @param selfClosing true when the source tag ended with '/'
     * @return the scanner state to continue with
     */
    private static states finishTag(StringBuilder out, Stack<String> opentags,
                                    String tagname, String attrs, boolean selfClosing) {
        if (tagname.length() > 0) {
            if (!tagname.equals("html") && opentags.empty()) {
                out.append("<html>");
                opentags.push("html");
            }
            List<String> closes = autoclosetags.get(tagname);
            if (closes != null && !opentags.empty() && closes.contains(opentags.peek())) {
                out.append("</").append(opentags.pop()).append('>');
            }
            if (tagname.equals("tr") && !opentags.empty() && opentags.peek().equals("table")) {
                out.append("<tbody>");
                opentags.push("tbody");
            }
            out.append('<').append(tagname).append(attrs);
            if (selfClosing || emptytags.contains(tagname)) {
                out.append("/>");
            } else {
                out.append('>');
                if (tagname.equals("script")) {
                    // script is closed by the endscript state, so it is not pushed
                    out.append(CDATA_OPEN);
                    return states.script;
                }
                opentags.push(tagname);
            }
        }
        // after "/" the remaining characters up to '>' are skipped
        return selfClosing ? states.tillgt : states.text;
    }

    /**
     * Converts HTML text to an XML document string.
     *
     * @param s HTML source; must not be {@code null}
     * @return an XML declaration followed by the converted markup (only the
     *         declaration when the input contains no content)
     */
    private String Convert2XML(String s) {
        final int limit = s.length();
        final StringBuilder out = new StringBuilder(limit + 64);
        final Stack<String> opentags = new Stack<String>();
        final List<String> attrnames = new ArrayList<String>();
        final StringBuilder attrvalue = new StringBuilder();

        states state = states.text;
        states prevstate = state;   // state to return to after an entity
        String name = "";           // name being collected (tag, attribute or entity)
        String tagname = "";
        String attrname = "";
        String attrs = "";
        int entvalue = 0;
        int entdigits = 0;
        char attrdelim = '"';       // ' ' means "unquoted value"
        char prec = ' ';            // previous and pre-previous characters, used by
        char preprec = ' ';         // the comment and script states

        String encoding = "iso-8859-1";
        int start = 0;
        // UTF-8 byte order mark as seen when the bytes were decoded one char per byte
        if (limit >= 3 && s.charAt(0) == 0xEF && s.charAt(1) == 0xBB && s.charAt(2) == 0xBF) {
            encoding = "utf-8";
            start = 3;
        }

        // Stop as soon as the root element has been closed.
        for (int i = start; i < limit && (out.length() == 0 || !opentags.empty()); i++) {
            final char c = s.charAt(i);
            switch (state) {
            case text:
                if (c == '<') {
                    name = "";
                    tagname = "";
                    attrname = "";
                    attrs = "";
                    attrnames.clear();
                    state = states.tag;
                    break;
                }
                if (opentags.empty()) {
                    if (Character.isWhitespace(c)) {
                        break; // ignore whitespace outside the root
                    }
                    out.append("<html>");
                    opentags.push("html");
                }
                if (c == '&') {
                    name = "";
                    entvalue = 0;
                    entdigits = 0;
                    prevstate = state;
                    state = states.entity;
                    break;
                }
                out.append(c);
                break;

            case tag:
                if (c == '?' && tagname.equals("")) {
                    state = states.tillinst;
                    break;
                }
                if (c == '!' && tagname.equals("")) {
                    state = states.specialtag;
                    prec = ' ';
                    break;
                }
                if (c == '/' && name.equals("") && tagname.equals("")) {
                    state = states.endtag;
                    break;
                }
                if (Character.isWhitespace(c)) {
                    if (name.equals("")) {
                        break;
                    }
                    if (tagname.equals("") && !name.equals("_")) {
                        tagname = name;
                        name = "";
                    } else if (attrname.equals("")) {
                        attrname = lower(name);
                        name = "";
                    }
                    break;
                }
                if (c == '=') {
                    if (attrname.equals("")) {
                        attrname = lower(name);
                        name = "";
                    }
                    attrvalue.setLength(0); // never reuse the previous attribute's value
                    state = states.tillquote;
                    break;
                }
                if (c == '/' || c == '>') {
                    if (tagname.equals("")) {
                        tagname = name;
                        name = "";
                    } else if (!name.equals("")) {
                        // a boolean attribute directly followed by '>' or '/'
                        attrname = lower(name);
                        name = "";
                    }
                    if (tagname.equals("")) {
                        state = states.text; // "<>" carries no element: drop it
                        break;
                    }
                    attrs += attribute(attrnames, attrname, null);
                    attrname = "";
                    state = finishTag(out, opentags, lower(tagname), attrs, c == '/');
                    if (state == states.script) {
                        prec = ' ';
                        preprec = ' ';
                    }
                    break;
                }
                // A new name starts: a pending attribute name had no value.
                attrs += attribute(attrnames, attrname, null);
                attrname = "";
                name = appendNameChar(name, c);
                break;

            case endtag:
                if (c == '>') {
                    name = lower(name);
                    if (opentags.search(name) != -1) {
                        // close everything up to and including the named element
                        String closed;
                        do {
                            closed = opentags.pop();
                            out.append("</").append(closed).append('>');
                        } while (!closed.equals(name));
                    } else if (!name.equals("html") && opentags.empty()) {
                        out.append("<html>");
                        opentags.push("html");
                    }
                    state = states.text;
                } else if (!Character.isWhitespace(c)) {
                    name = appendNameChar(name, c);
                }
                break;

            case attrtext: {
                if (c == attrdelim || (attrdelim == ' ' && Character.isWhitespace(c))) {
                    attrs += attribute(attrnames, attrname, attrvalue.toString());
                    attrname = "";
                    state = states.tag;
                    break;
                }
                // In an unquoted value only "/>" ends the tag, so "href=/a/b" survives.
                boolean selfClose = c == '/' && i + 1 < limit && s.charAt(i + 1) == '>';
                if (attrdelim == ' ' && (c == '>' || selfClose)) {
                    attrs += attribute(attrnames, attrname, attrvalue.toString());
                    attrname = "";
                    state = finishTag(out, opentags, lower(tagname), attrs, selfClose);
                    if (state == states.script) {
                        prec = ' ';
                        preprec = ' ';
                    }
                    break;
                }
                if (c == '&') {
                    name = "";
                    entvalue = 0;
                    entdigits = 0;
                    prevstate = state;
                    state = states.entity;
                    break;
                }
                attrvalue.append(escapeAttrChar(c));
                break;
            }

            case script:
                if (c == '/' && prec == '<') {
                    state = states.endscript;
                    name = "";
                    break;
                }
                if (c == '[' && prec == '!' && preprec == '<') {
                    state = states.skipcdata;
                    name = "<![";
                    break;
                }
                if (c == '>' && prec == ']' && preprec == ']') {
                    // an embedded "]]>" would end our CDATA section: drop it
                    out.setLength(out.length() - 2);
                    prec = ' ';
                    preprec = ' ';
                    break;
                }
                out.append(c);
                preprec = prec;
                prec = c;
                break;

            case endscript:
                if (c == '>' && lower(name).equals("script")) {
                    out.setLength(out.length() - 1); // the '<' of "</script>" was already copied
                    out.append("]]></script>");
                    state = states.text;
                    break;
                }
                name += c;
                if (!"script".startsWith(lower(name))) {
                    // not the end of the script after all: restore "/" plus what was read
                    out.append('/').append(name);
                    preprec = ' ';
                    prec = c;
                    state = states.script;
                }
                break;

            case specialtag:
                if (c != '-') {
                    state = states.tillgt; // DOCTYPE and similar declarations
                    break;
                }
                if (prec == '-') {
                    state = states.comment;
                    preprec = ' ';
                    break;
                }
                prec = c;
                break;

            case comment:
                if (c == '>' && prec == '-' && preprec == '-') {
                    state = states.text;
                    break;
                }
                preprec = prec;
                prec = c;
                break;

            case skipcdata:
                name += c;
                if (name.equals(CDATA_OPEN)) {
                    out.setLength(out.length() - 2); // remove the "<!" copied before detection
                    prec = ' ';
                    preprec = ' ';
                    state = states.script;
                } else if (!CDATA_OPEN.startsWith(name)) {
                    out.append(name.substring(2)); // "<!" is already in the output
                    preprec = ' ';
                    prec = c;
                    state = states.script;
                }
                break;

            case entity: {
                if (c == '#') {
                    state = states.numericentity;
                    break;
                }
                if (!Character.isLetterOrDigit(c)) {
                    // a bare '&': emit it escaped and re-read this character
                    (prevstate == states.text ? out : attrvalue).append("&amp;");
                    state = prevstate;
                    i--;
                    break;
                }
                name += c;
                state = states.namedentity;
                break;
            }

            case numericentity:
            case hexaentity: {
                if (state == states.numericentity && entdigits == 0 && (c == 'x' || c == 'X')) {
                    state = states.hexaentity;
                    break;
                }
                int radix = state == states.hexaentity ? 16 : 10;
                int digit = Character.digit(c, radix);
                if (digit >= 0) {
                    if (entvalue <= 0x10FFFF) { // stop accumulating once out of range (no overflow)
                        entvalue = entvalue * radix + digit;
                    }
                    entdigits++;
                    break;
                }
                StringBuilder sink = prevstate == states.text ? out : attrvalue;
                if (entdigits == 0) {
                    sink.append(radix == 16 ? "&amp;#x" : "&amp;#");
                } else {
                    sink.append(charRef(entvalue));
                }
                state = prevstate;
                if (c != ';' || entdigits == 0) {
                    i--; // the terminator was not part of the reference: re-read it
                }
                break;
            }

            case namedentity: {
                StringBuilder sink = prevstate == states.text ? out : attrvalue;
                if (c == ';') {
                    sink.append(resolveNamedEntity(name));
                    name = "";
                    state = prevstate;
                    break;
                }
                if (!Character.isLetterOrDigit(c) || name.length() >= MAX_ENTITY_NAME) {
                    // not an entity: keep the text and re-read this character
                    sink.append("&amp;").append(name);
                    name = "";
                    state = prevstate;
                    i--;
                    break;
                }
                name += c;
                break;
            }

            case tillinst:
                if (c == '?') {
                    state = states.andgt;
                }
                break;

            case andgt:
                if (c == '>') {
                    state = states.text;
                } else if (c != '?') {
                    state = states.tillinst;
                }
                break;

            case tillgt:
                if (c == '>') {
                    state = states.text;
                }
                break;

            case tillquote: {
                if (Character.isWhitespace(c)) {
                    break;
                }
                if (c == '"' || c == '\'') {
                    attrdelim = c;
                    attrvalue.setLength(0);
                    state = states.attrtext;
                    break;
                }
                boolean selfClose = c == '/' && i + 1 < limit && s.charAt(i + 1) == '>';
                if (c == '>' || selfClose) {
                    // "name=" without a value
                    attrs += attribute(attrnames, attrname, "");
                    attrname = "";
                    state = finishTag(out, opentags, lower(tagname), attrs, selfClose);
                    if (state == states.script) {
                        prec = ' ';
                        preprec = ' ';
                    }
                    break;
                }
                // unquoted value: let the attrtext state handle this character
                attrdelim = ' ';
                attrvalue.setLength(0);
                state = states.attrtext;
                i--;
                break;
            }
            }
        }

        if (state == states.script || state == states.endscript || state == states.skipcdata) {
            out.append("]]></script>"); // input ended inside a script
        }
        while (!opentags.empty()) {
            out.append("</").append(opentags.pop()).append('>');
        }
        return "<?xml version=\"1.0\" encoding=\"" + encoding + "\"?>\n" + out;
    }

    /**
     * Writes {@code xml} to {@code xmlFileName}, appending ".xml" when the name
     * lacks that extension. Uses the platform default charset ({@link FileWriter}).
     */
    private static void writeXml(String xmlFileName, String xml) throws java.io.IOException {
        if (!xmlFileName.endsWith(".xml")) {
            xmlFileName += ".xml";
        }
        FileWriter fw = new FileWriter(xmlFileName);
        try {
            fw.write(xml);
        } finally {
            fw.close();
        }
    }

    /**
     * Downloads a URL, converts its content to XML and saves the result to a file.
     *
     * @param address     the address to read
     * @param xmlFileName target file; ".xml" is appended when missing
     * @throws Exception if the address is malformed or reading/writing fails
     */
    public void URL2XML(String address, String xmlFileName) throws Exception {
        StringBuilder html = new StringBuilder();
        InputStream inStream = new URL(address).openStream();
        try {
            Scanner in = new Scanner(inStream);
            while (in.hasNextLine()) {
                html.append(in.nextLine()).append('\n');
            }
            // Scanner hides read errors; report them instead of converting a truncated page
            if (in.ioException() != null) {
                throw in.ioException();
            }
        } finally {
            inStream.close();
        }
        writeXml(xmlFileName, Convert2XML(html.toString()));
    }

    /**
     * Reads an HTML file from disk, converts it to XML and saves the result to a file.
     *
     * @param htmlFilePath path of the HTML file to read
     * @param xmlFileName  target file; ".xml" is appended when missing
     * @throws Exception if reading or writing fails
     */
    public void HtmlFile2XML(String htmlFilePath, String xmlFileName) throws Exception {
        StringBuilder html = new StringBuilder();
        BufferedReader br = new BufferedReader(new FileReader(htmlFilePath));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                html.append(line).append('\n');
            }
        } finally {
            br.close();
        }
        writeXml(xmlFileName, Convert2XML(html.toString()));
    }

    /**
     * Converts an HTML string to XML.
     *
     * @param htmlString  the HTML text
     * @param xmlFileName target file (".xml" is appended when missing); only used when {@code toFile} is true
     * @param toFile      when true the XML is written to {@code xmlFileName}, otherwise it is returned
     * @return the XML text when {@code toFile} is false, otherwise the empty string
     * @throws Exception if writing the file fails
     */
    public String StringPattern2XML(String htmlString, String xmlFileName, boolean toFile) throws Exception {
        String xml = Convert2XML(htmlString);
        if (!toFile) {
            return xml;
        }
        writeXml(xmlFileName, xml);
        return "";
    }

    /**
     * Demo: downloads a page and stores it as {@code xxxx.xml} (requires network access).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            Html2Xml h2x = new Html2Xml();
            String add = "http://www.bk4.com.vn";
            System.out.println("starting convert. Please wait...");
            h2x.URL2XML(add, "xxxx.xml");
            System.out.println("convert completed...");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

Tài liệu tham khảo:

http://www.ibm.com/developerworks/web/library/x-html5xhtml2.html?S_TACT=105AGX08&S_CMP=EDU

Share this:

  • Facebook
  • X
Like Loading...

Từ khóa » Chuyển đổi Từ Html Sang Xml