001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.io;
003
004import java.io.IOException;
005import java.io.Reader;
006import java.util.Arrays;
007
008import org.openstreetmap.josm.tools.Logging;
009
010/**
011 * FilterInputStream that gets rid of characters that are invalid in an XML 1.0
012 * document.
013 *
014 * Although these characters are forbidden, in the real wold they still appear
015 * in XML files. Java's SAX parser throws an exception, so we have to filter
016 * at a lower level.
017 *
018 * Only handles control characters (<0x20). Invalid characters are replaced
019 * by space (0x20).
020 */
021public class InvalidXmlCharacterFilter extends Reader {
022
023    private final Reader reader;
024
025    private static boolean firstWarning = true;
026
027    private static final boolean[] INVALID_CHARS;
028
029    static {
030        INVALID_CHARS = new boolean[0x20];
031        Arrays.fill(INVALID_CHARS, true);
032        INVALID_CHARS[0x9] = false; // tab
033        INVALID_CHARS[0xA] = false; // LF
034        INVALID_CHARS[0xD] = false; // CR
035    }
036
037    /**
038     * Constructs a new {@code InvalidXmlCharacterFilter} for the given Reader.
039     * @param reader The reader to filter
040     */
041    public InvalidXmlCharacterFilter(Reader reader) {
042        this.reader = reader;
043    }
044
045    @Override
046    public int read(char[] b, int off, int len) throws IOException {
047        int n = reader.read(b, off, len);
048        if (n == -1) {
049            return -1;
050        }
051        for (int i = off; i < off + n; ++i) {
052            b[i] = filter(b[i]);
053        }
054        return n;
055    }
056
057    @Override
058    public void close() throws IOException {
059        reader.close();
060    }
061
062    private static char filter(char in) {
063        if (in < 0x20 && INVALID_CHARS[in]) {
064            if (firstWarning) {
065                Logging.warn("Invalid xml character encountered: '"+in+"'.");
066                firstWarning = false;
067            }
068            return 0x20;
069        }
070        return in;
071    }
072}