Thursday, September 11, 2008

xml文件过大

在项目中遇到了被解析的 XML 文件超过 30M 甚至 60M 的情况。现在已经不好再去追究为什么不在一开始生成 XML 时就把文件做小一点,遇到这个问题后只能去解决它。解决问题的同时又怕重复发明轮子,所以我先看了一下现有的 XML 解析工具:JDOM 的 SAXBuilder 和 dom4j 的 SAXReader 都是把 XML 文件一次性读入,XML 文件过大时会报内存溢出的异常。SAXParser 虽然可以流式读入并解析,但它也是一次解析完:假设 XML 文件中有一万条数据,解析后就必须在内存中存放这么多对象。个人觉得这样不够灵活,于是自己做了个小工具来切分 XML。前提是这个 XML 文件必须带有文件头 <?xml version="1.0" encoding="GBK"?>,并且 encoding 必须与文件的实际编码一致,不然解析时会出现乱码。
 

package searchRing.ring.util.xmlBufferTool;

import java.io.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;


/**
 * Splits a large, flat XML file into small well-formed XML documents so that
 * each piece can be handed to a DOM-style parser without loading the whole
 * file into memory.
 *
 * <p>The expected layout is an optional XML declaration, one root element and
 * a sequence of sibling record elements directly below it. The root tag and
 * the record tag are detected from the beginning of the file; only plain
 * start tags of the form {@code <name>} (letters only, no attributes) are
 * recognised.
 *
 * <p>Each call to {@link #popDividedDataAfterParser()} returns the next batch
 * of complete records wrapped in the root element, or {@code null} once the
 * file is exhausted. The underlying file is closed automatically when its end
 * is reached; call {@link #close()} to release it earlier.
 */
public class XMLBufferTool implements Closeable {
    /** Number of chunks pulled from the file per refill. */
    private static final int DEFAULT_LINE_COUNT = 10;
    /** Refilling stops once at least this many records are buffered for output. */
    private static final int DEFAULT_MAX_OUTPUT_SIZE = 50;
    /** Size, in chars, of one chunk read from the file. */
    private static final int READ_CHUNK_SIZE = 1024;
    /** Number of leading bytes inspected when looking for the XML declaration. */
    private static final int SNIFF_SIZE = 512;

    /** A start tag consisting of letters only, without attributes. */
    private static final Pattern ELEMENT_PATTERN = Pattern.compile("<[a-zA-Z]+>");
    /** The XML declaration, e.g. {@code <?xml version="1.0" encoding="GBK"?>}. */
    private static final Pattern DECLARATION_PATTERN = Pattern.compile("<\\?xml\\s[^>]*\\?>");
    /** The encoding pseudo-attribute inside the declaration; either quote style. */
    private static final Pattern ENCODING_PATTERN =
            Pattern.compile("encoding\\s*=\\s*[\"']([^\"']+)[\"']");

    /** Text read from the file that has not yet been moved to the output. */
    private StringBuffer xmlContentBuffer;

    /* just used to store and output the data divided */
    XMLOutputBuffer xmlOutput;

    private String charSetTitle = "";

    private String rootElemetMark = "";

    private String childElementMark = "";

    /** Closing counterpart of {@link #childElementMark}; empty when no record tag was found. */
    private String closingChildMark = "";

    /** Set once the reader reported end of file or the tool was closed. */
    private boolean endOfInput = false;

    InputStreamReader bufferedReader;
    InputStream fileInputStream;

    /**
     * Opens the file, detects its encoding, root tag and record tag, and
     * buffers the first records.
     *
     * @param xmlFilePath path of the XML file to split
     * @throws IllegalArgumentException if the file cannot be opened or read, or
     *         if it declares an encoding this JVM does not support; the
     *         underlying {@link IOException} is attached as the cause
     */
    public XMLBufferTool(String xmlFilePath) {

        this.xmlContentBuffer = new StringBuffer();

        try {
            String charSet = getCharSet(xmlFilePath);
            this.fileInputStream = new FileInputStream(xmlFilePath);
            if (charSet != null) {
                this.bufferedReader = new InputStreamReader(this.fileInputStream, charSet);
            } else {
                // No declared encoding: fall back to the platform default.
                this.bufferedReader = new InputStreamReader(this.fileInputStream);
            }
            preparePaser();
        } catch (IOException ioe) {
            // Fail here with the real cause instead of continuing with a null
            // reader and failing later with an unrelated NullPointerException.
            closeQuietly();
            throw new IllegalArgumentException("Cannot read XML file: " + xmlFilePath, ioe);
        }
    }

    /**
     * @return the XML declaration found at the start of the file, or an empty
     *         string if the file has none
     */
    public String getCharSetTitle() {
        return charSetTitle;
    }

    /**
     * @return the detected root start tag (for example {@code <rings>}), or an
     *         empty string if none was found
     */
    public String getRootElemetMark() {
        return rootElemetMark;
    }

    /**
     * Releases the underlying file. Records that were already read and are
     * complete can still be retrieved afterwards; nothing further is read.
     *
     * @throws IOException if closing the file fails
     */
    public void close() throws IOException {
        this.endOfInput = true;
        try {
            if (this.bufferedReader != null) {
                this.bufferedReader.close();
            }
        } finally {
            if (this.fileInputStream != null) {
                this.fileInputStream.close();
            }
        }
    }

    /** Best-effort close used on the constructor's failure path. */
    private void closeQuietly() {
        try {
            close();
        } catch (IOException ignored) {
            // The original failure is the one reported to the caller.
        }
    }

    /**
     * Reads the encoding named in the XML declaration at the start of a file.
     *
     * @param filePath file to inspect
     * @return the declared encoding name, or {@code null} if the file is
     *         empty, has no declaration in its first {@value #SNIFF_SIZE}
     *         bytes, or the declaration names no encoding
     * @throws IOException if the file cannot be opened or read
     */
    private String getCharSet(String filePath) throws IOException {
        byte[] head = new byte[SNIFF_SIZE];
        int length;
        FileInputStream sniffInput = new FileInputStream(filePath);
        try {
            length = sniffInput.read(head);
        } finally {
            sniffInput.close();
        }
        if (length <= 0) {
            return null;
        }

        // ISO-8859-1 maps every byte to one char, so the ASCII declaration is
        // readable whatever ASCII-compatible encoding the file really uses.
        String headText = new String(head, 0, length, "ISO-8859-1");
        Matcher declaration = DECLARATION_PATTERN.matcher(headText);
        if (!declaration.find()) {
            return null;
        }
        Matcher encoding = ENCODING_PATTERN.matcher(declaration.group());
        return encoding.find() ? encoding.group(1) : null;
    }

    /**
     * Reads the head of the file, strips the XML declaration and the root
     * start tag from the working buffer, remembers the first record tag and
     * moves the first complete records to the output buffer.
     *
     * @throws IOException if reading the file fails
     */
    private void preparePaser() throws IOException {
        readSomeLine(DEFAULT_LINE_COUNT);

        Matcher m = DECLARATION_PATTERN.matcher(this.xmlContentBuffer);
        if (m.find()) {
            this.charSetTitle = m.group();
            this.xmlContentBuffer.delete(0, m.end());
        }

        m = ELEMENT_PATTERN.matcher(this.xmlContentBuffer);
        if (m.find()) {
            this.rootElemetMark = m.group();
            this.xmlContentBuffer.delete(0, m.end());
        }

        // The first plain start tag after the root is taken as the record tag.
        m = ELEMENT_PATTERN.matcher(this.xmlContentBuffer);
        if (m.find()) {
            this.childElementMark = m.group();
        }
        this.closingChildMark = this.childElementMark.replaceFirst("<", "</");
        this.xmlOutput = new XMLOutputBuffer(this.childElementMark);

        parserBuffer();
    }

    /**
     * Appends up to {@code lineCount} chunks of {@value #READ_CHUNK_SIZE}
     * chars to the working buffer. Despite the name, input is read in
     * fixed-size chunks, not lines. The file is closed when its end is reached.
     *
     * @param lineCount maximum number of chunks to read
     * @return the result of the last read: a positive char count, {@code 0} if
     *         nothing was attempted, or {@code -1} at end of input
     * @throws IOException if reading or closing the file fails
     */
    private int readSomeLine(int lineCount) throws IOException {
        if (this.endOfInput) {
            return -1;
        }

        char[] chunk = new char[READ_CHUNK_SIZE];
        int read = 0;
        for (int i = 0; i < lineCount; i++) {
            read = this.bufferedReader.read(chunk);
            if (read < 0) {
                close();
                break;
            }
            this.xmlContentBuffer.append(chunk, 0, read);
        }
        return read;
    }

    /**
     * Moves complete records from the working buffer to the output buffer.
     *
     * <p>While more input may follow, everything before the last record start
     * tag is moved, because the record that tag opens may still be incomplete.
     * At end of input everything up to and including the last record end tag
     * is moved, so the final record is emitted too, and whatever trails it
     * (such as the root end tag) is discarded.
     */
    private void parserBuffer() {
        if (this.childElementMark.length() == 0) {
            // No record tag was detected, so there is nothing to split on.
            if (this.endOfInput) {
                this.xmlContentBuffer.setLength(0);
            }
            return;
        }

        int cut;
        if (this.endOfInput) {
            int lastClose = this.xmlContentBuffer.lastIndexOf(this.closingChildMark);
            cut = lastClose < 0 ? 0 : lastClose + this.closingChildMark.length();
        } else {
            cut = this.xmlContentBuffer.lastIndexOf(this.childElementMark);
        }

        if (cut > 0) {
            this.xmlOutput.append(this.xmlContentBuffer.substring(0, cut));
            this.xmlContentBuffer.delete(0, cut);
        }
        if (this.endOfInput) {
            this.xmlContentBuffer.setLength(0);
        }
    }

    /**
     * Returns the next batch of records as a small XML document: the root
     * start tag, the records, then the matching root end tag. The XML
     * declaration is not included; see {@link #getCharSetTitle()}.
     *
     * <p>Input is read until at least {@value #DEFAULT_MAX_OUTPUT_SIZE} records
     * are buffered or the file ends, so a batch may hold more records than
     * that.
     *
     * @return the next batch, or {@code null} when no records are left
     * @throws IOException if reading the file fails
     */
    public StringBuffer popDividedDataAfterParser() throws IOException {

        while (this.xmlOutput.getItemCount() < DEFAULT_MAX_OUTPUT_SIZE) {
            int read = readSomeLine(DEFAULT_LINE_COUNT);
            parserBuffer();
            if (read < 0) {
                break;
            }
        }

        if (this.xmlOutput.getItemCount() == 0) {
            return null;
        }

        StringBuffer batch = this.xmlOutput.getXmlOutput();
        this.xmlOutput.clearBuffer();
        return batch.insert(0, this.rootElemetMark)
                .append(this.rootElemetMark.replaceFirst("<", "</"));
    }

    /**
     * Demo: prints the number of {@code <ring>} records in the first batch of
     * a file, followed by the batch itself.
     *
     * @param args optional; {@code args[0]} overrides the built-in sample path
     */
    public static void main(String args[]) throws Exception {
        String str = args.length > 0 ? args[0] : "F:/ringInfoXML/ringTime.xml";

        XMLBufferTool xmlb = new XMLBufferTool(str);
        try {
            StringBuffer s = xmlb.popDividedDataAfterParser();
            int i = 0;
            if (s != null) {
                Matcher m = Pattern.compile("<ring>").matcher(s);
                while (m.find()) {
                    i++;
                }
            }

            System.out.println(i);
            System.out.println(s);
        } finally {
            xmlb.close();
        }
    }

    /** Accumulates record text and counts how many records it holds. */
    private static class XMLOutputBuffer {
        private StringBuffer xmlOutput;
        private int itemCount;

        private Pattern markPattern;

        /**
         * @param markStr the record start tag; matched literally when counting
         */
        XMLOutputBuffer(String markStr) {
            // Quote the tag so it is never interpreted as a regular expression.
            this.markPattern = Pattern.compile(Pattern.quote(markStr));
            this.xmlOutput = new StringBuffer();
            this.itemCount = 0;
        }

        /**
         * Appends text and adds the number of record start tags it contains
         * to the item count. Null and empty input is ignored.
         */
        public void append(String str) {
            if (str == null || "".equals(str)) {
                return;
            }
            this.xmlOutput.append(str);
            Matcher m = this.markPattern.matcher(str);
            while (m.find()) {
                this.itemCount++;
            }
        }

        /** Starts a fresh buffer; a buffer handed out earlier is left untouched. */
        public void clearBuffer() {
            this.xmlOutput = new StringBuffer();
            this.itemCount = 0;
        }

        public StringBuffer getXmlOutput() {
            return xmlOutput;
        }

        public int getItemCount() {
            return itemCount;
        }
    }

}

代码中 popDividedDataAfterParser() 输出的 StringBuffer 可以用来初始化一个 StringReader,再交给 dom4j 的 SAXReader 去解析。两者结合使用,想处理多少就先切分出多少来解析,特别适合多线程下生产者和消费者的场景,希望对大家有用。
 

No comments: