三生的博客: xml文件过大

在项目里遇到了一些被解析的 xml 文件超过 30M 甚至 60M 的情况。现在已经不好去追究为什么不在一开始生成 xml 的时候就把文件做小一点，但遇到这个问题后，我只能解决它。解决问题的同时又怕重复发明轮子，所以先看了下现有的 xml 解析工具：jdom 的 SAXBuilder 和 dom4j 的 SAXReader 都是把 XML 文件一次读入，xml 文件过大时会报内存溢出的异常；即使 SAXParser 可以分批读入解析，它也是一次把整个文件解析完，假设 XML 文件中有一万条数据，解析后就必须在内存中放这么多的对象。个人觉得这样有些不灵活，就自己做了个小东西来切分。但前提是这个 xml 文件得有文件头 <?xml version="1.0" encoding="GBK"?>，并且 encoding 必须跟文件的实际编码格式一致，不然解析的时候会出乱码。

package searchRing.ring.util.xmlBufferTool;

import java.io.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * Splits a large, flat XML file into small well-formed fragments so that each
 * fragment can be handed to an in-memory parser separately.
 *
 * <p>The file is expected to look like
 * {@code <?xml ...?><root><child>..</child><child>..</child>...</root>}.
 * The first tag found after the XML declaration is treated as the root tag and
 * the next tag as the repeating child tag. Tags are recognised with
 * {@code <[a-zA-Z]+>}, i.e. only attribute-less, letters-only tag names are
 * supported.
 *
 * <p>Each call to {@link #popDividedDataAfterParser()} returns the next batch of
 * child elements wrapped in the root tag, or {@code null} when the input is
 * exhausted. The underlying file is closed automatically at end of input; call
 * {@link #close()} to release it earlier.
 *
 * <p>This class is not synchronized beyond what {@link StringBuffer} provides.
 */
public class XMLBufferTool implements Closeable {
    /** Number of read chunks pulled from the file on each refill. */
    private static final int defaultLineCount = 10;
    /** Refilling stops once at least this many child elements are buffered. */
    private static final int defaultMaxOutputSize = 50;
    /** Size, in chars, of one read chunk. */
    private static final int READ_CHUNK_CHARS = 1024;
    /** Number of leading bytes inspected when looking for the declared encoding. */
    private static final int SNIFF_BYTES = 512;

    /** An opening tag without attributes whose name consists of ASCII letters only. */
    private static final Pattern elementPattern = Pattern.compile("<[a-zA-Z]+>");
    /** The XML declaration, e.g. {@code <?xml version="1.0" encoding="GBK"?>}. */
    private static final Pattern charSetPattern = Pattern.compile("<[?]xml\\s[^?>]*[?]>");
    /** The {@code encoding} pseudo-attribute inside the XML declaration. */
    private static final Pattern encodingPattern =
            Pattern.compile("encoding\\s*=\\s*[\"']([0-9a-zA-Z._:-]+)[\"']");

    /** Text read from the file that has not yet been moved to {@link #xmlOutput}. */
    private StringBuffer xmlContentBuffer;

    /* just used to store and output the data divided */
    XMLOutputBuffer xmlOutput;

    /** The XML declaration as found in the file, or "" when there is none. */
    private String charSetTitle = "";

    /** The opening root tag, or "" when none was found. */
    private String rootElemetMark = "";

    /** The opening tag of the repeating child element, or "" when none was found. */
    private String childElementMark = "";

    InputStreamReader bufferedReader;
    InputStream fileInputStream;

    /** Set once the reader has reported end of stream; no further reads are attempted. */
    private boolean endOfInput = false;

    /**
     * Opens the file, detects its declared encoding and reads the first chunk
     * so that the declaration, root tag and child tag are known.
     *
     * @param xmlFilePath path of the XML file to split
     * @throws IllegalStateException if the file cannot be opened or read, or if
     *         its declared encoding is not supported; the cause is the
     *         underlying {@link IOException}
     */
    public XMLBufferTool(String xmlFilePath) {

        this.xmlContentBuffer = new StringBuffer();

        try {
            String charSet = getCharSet(xmlFilePath);
            this.fileInputStream = new FileInputStream(xmlFilePath);
            if (charSet != null) {
                this.bufferedReader = new InputStreamReader(this.fileInputStream, charSet);
            } else {
                // No declared encoding: fall back to the platform default charset.
                this.bufferedReader = new InputStreamReader(this.fileInputStream);
            }
            preparePaser();
        } catch (IOException ioe) {
            // Fail here with the real cause instead of continuing with a
            // half-initialised object that breaks later with a NullPointerException.
            closeQuietly();
            throw new IllegalStateException("Cannot read XML file: " + xmlFilePath, ioe);
        }
    }

    /**
     * @return the XML declaration found at the start of the file, or "" if none
     */
    public String getCharSetTitle() {
        return charSetTitle;
    }

    /**
     * @return the opening root tag (e.g. {@code <rings>}), or "" if none was found
     */
    public String getRootElemetMark() {
        return rootElemetMark;
    }

    /**
     * Releases the underlying file. Safe to call more than once. After an
     * explicit close before end of input, further reads fail with an
     * {@link IOException}.
     *
     * @throws IOException if closing the stream fails
     */
    public void close() throws IOException {
        if (this.bufferedReader != null) {
            // Closing the reader also closes the wrapped file stream.
            this.bufferedReader.close();
        } else if (this.fileInputStream != null) {
            this.fileInputStream.close();
        }
    }

    /** Closes the file while ignoring close failures; used on constructor error paths. */
    private void closeQuietly() {
        try {
            close();
        } catch (IOException ignored) {
            // The original failure is the one worth reporting.
        }
    }

    /**
     * Reads the beginning of the file and extracts the encoding named in the
     * XML declaration.
     *
     * @param filePath path of the XML file
     * @return the declared encoding name, or {@code null} when the file is
     *         empty, has no XML declaration, or the declaration names no encoding
     * @throws IOException if the file cannot be opened or read
     */
    private String getCharSet(String filePath) throws IOException {
        byte[] head = new byte[SNIFF_BYTES];
        int length = 0;
        FileInputStream input = new FileInputStream(filePath);
        try {
            int n;
            while (length < head.length
                    && (n = input.read(head, length, head.length - length)) > 0) {
                length += n;
            }
        } finally {
            input.close();
        }
        if (length == 0) {
            return null;
        }

        // ISO-8859-1 maps every byte to one char, so the ASCII declaration can be
        // located regardless of the platform charset. Only the bytes actually
        // read are decoded.
        String prolog = new String(head, 0, length, "ISO-8859-1");
        Matcher declaration = charSetPattern.matcher(prolog);
        if (!declaration.find()) {
            return null;
        }
        // Look specifically for encoding="..." so that version="1" or
        // standalone="yes" can never be mistaken for the charset.
        Matcher encoding = encodingPattern.matcher(declaration.group());
        return encoding.find() ? encoding.group(1) : null;
    }

    /**
     * Reads the first chunk of the file, strips the XML declaration and the
     * root tag from the buffer, remembers the child tag and moves the first
     * complete child elements to the output buffer.
     *
     * @throws IOException if reading fails
     */
    private void preparePaser() throws IOException {
        readSomeLine(defaultLineCount);

        Matcher m = charSetPattern.matcher(xmlContentBuffer);
        if (m.find()) {
            this.charSetTitle = m.group();
            this.xmlContentBuffer.delete(0, m.end());
        }

        // A fresh matcher is needed after every delete because the buffer changed.
        m = elementPattern.matcher(xmlContentBuffer);
        if (m.find()) {
            this.rootElemetMark = m.group();
            this.xmlContentBuffer.delete(0, m.end());
        }

        m = elementPattern.matcher(xmlContentBuffer);
        if (m.find()) {
            // The child tag is kept in the buffer: it belongs to the first child element.
            this.childElementMark = m.group();
        }
        this.xmlOutput = new XMLOutputBuffer(this.childElementMark);

        parserBuffer();
    }

    /**
     * Appends up to {@code lineCount} chunks of {@value #READ_CHUNK_CHARS}
     * chars from the file to the content buffer. Despite the name, the unit is
     * a fixed-size chunk, not a text line.
     *
     * @param lineCount maximum number of chunks to read
     * @return the size of the last chunk read, or a negative value once the end
     *         of the file has been reached
     * @throws IOException if reading fails
     */
    private int readSomeLine(int lineCount) throws IOException {
        if (this.endOfInput) {
            return -1;
        }

        char[] chunk = new char[READ_CHUNK_CHARS];
        int read = 0;
        for (int i = 0; i < lineCount; i++) {
            read = this.bufferedReader.read(chunk);
            if (read < 0) {
                // Remember EOF so the closed reader is never touched again.
                this.endOfInput = true;
                close();
                break;
            }
            xmlContentBuffer.append(chunk, 0, read);
        }
        return read;
    }

    /**
     * Moves everything before the last child tag in the content buffer to the
     * output buffer. The text from the last child tag onwards may be an
     * incomplete element, so it stays in the content buffer.
     */
    private void parserBuffer() {
        if (this.childElementMark.length() == 0) {
            return; // no child tag known: nothing can be split
        }

        int lastIndex = this.xmlContentBuffer.lastIndexOf(this.childElementMark);
        if (lastIndex > 0) {
            this.xmlOutput.append(this.xmlContentBuffer.substring(0, lastIndex));
            this.xmlContentBuffer.delete(0, lastIndex);
        }
    }

    /**
     * Called at end of input: moves what is left in the content buffer (the
     * final child element) to the output buffer, dropping the closing root tag
     * and anything after it.
     */
    private void flushRemainder() {
        if (this.xmlContentBuffer.length() == 0) {
            return;
        }
        String tail = this.xmlContentBuffer.toString();
        this.xmlContentBuffer.setLength(0);

        if (this.rootElemetMark.length() > 0) {
            int closingAt = tail.lastIndexOf(closingRootTag());
            if (closingAt >= 0) {
                tail = tail.substring(0, closingAt);
            }
        }
        if (tail.trim().length() > 0) {
            this.xmlOutput.append(tail);
        }
    }

    /** @return the closing tag matching {@link #rootElemetMark} */
    private String closingRootTag() {
        return this.rootElemetMark.replaceFirst("<", "</");
    }

    /**
     * Returns the next batch of child elements wrapped in the root tag.
     *
     * <p>The file is read until at least {@value #defaultMaxOutputSize} child
     * elements are buffered or the end of the file is reached; a batch may
     * therefore contain more or fewer elements than that number.
     *
     * @return a well-formed fragment {@code <root>...children...</root>}, or
     *         {@code null} when no child elements are left
     * @throws IOException if reading fails
     */
    public StringBuffer popDividedDataAfterParser() throws IOException {

        while (this.xmlOutput.getItemCount() < defaultMaxOutputSize) {
            int read = readSomeLine(defaultLineCount);
            parserBuffer();
            if (read < 0) {
                // At EOF the final element is still in the content buffer.
                flushRemainder();
                break;
            }
        }

        if (this.xmlOutput.getItemCount() == 0) {
            return null;
        }

        StringBuffer fragment = this.xmlOutput.getXmlOutput();
        // clearBuffer() swaps in a new buffer, so 'fragment' is safe to modify.
        this.xmlOutput.clearBuffer();
        return fragment.insert(0, this.rootElemetMark).append(closingRootTag());
    }

    /**
     * Demo: prints the number of {@code <ring>} elements in the first fragment
     * followed by the fragment itself.
     *
     * @param args optional; {@code args[0]} is the XML file path, defaulting to
     *        {@code F:/ringInfoXML/ringTime.xml}
     */
    public static void main(String args[]) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : "F:/ringInfoXML/ringTime.xml";

        XMLBufferTool tool = new XMLBufferTool(path);
        try {
            StringBuffer fragment = tool.popDividedDataAfterParser();
            if (fragment == null) {
                // No child elements at all.
                System.out.println(0);
                return;
            }

            int count = 0;
            Matcher m = Pattern.compile("<ring>").matcher(fragment);
            while (m.find()) {
                count++;
            }

            System.out.println(count);
            System.out.println(fragment);
        } finally {
            tool.close();
        }
    }

    /**
     * Accumulates split-off XML text and counts how many child opening tags it
     * contains.
     */
    private static class XMLOutputBuffer {
        private StringBuffer xmlOutput;
        private int itemCount;

        /** Literal opening tag that marks one item; "" means nothing is counted. */
        private final String mark;

        XMLOutputBuffer(String markStr) {
            this.mark = (markStr == null) ? "" : markStr;
            this.xmlOutput = new StringBuffer();
            this.itemCount = 0;
        }

        /**
         * Appends text and adds the number of child tags in it to the item count.
         *
         * @param str text to append; {@code null} and "" are ignored
         */
        public void append(String str) {
            if (str == null || str.length() == 0) {
                return;
            }
            this.xmlOutput.append(str);
            if (this.mark.length() == 0) {
                return;
            }
            // Literal search: the tag is plain text, not a regular expression.
            for (int at = str.indexOf(this.mark); at >= 0;
                    at = str.indexOf(this.mark, at + this.mark.length())) {
                this.itemCount++;
            }
        }

        /** Starts a new, empty buffer; a previously returned buffer is left untouched. */
        public void clearBuffer() {
            this.xmlOutput = new StringBuffer();
            this.itemCount = 0;
        }

        public StringBuffer getXmlOutput() {
            return xmlOutput;
        }

        public int getItemCount() {
            return itemCount;
        }
    }

}

代码中 popDividedDataAfterParser() 输出的 StringBuffer 可用来初始化一个 StringReader，再交给 dom4j 的 SAXReader 去解析。这样联合起来用，想处理多少，就先切分出多少来解析，特别适合多线程的生产者/消费者场景。希望对大家有用。

三生的博客

Thursday, September 11, 2008

xml文件过大

No comments:

Blog Archive

About Me