package crawlercommons.sitemaps;

import crawlercommons.sitemaps.AbstractSiteMap;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.tika.Tika;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AutoDetectParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

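/**
 * Parses sitemap content (XML sitemaps and sitemap indexes, plain-text URL
 * lists, gzip-compressed XML, and Atom/RSS feeds) into {@link AbstractSiteMap}
 * objects.
 *
 * <p>Decompiled source; loaded from: input_file:crawlercommons/sitemaps/SiteMapParser.class
 */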
public class SiteMapParser {
    /** Cap on the number of entries handled by the parsing loops that reference it. */
    private static final int MAX_URLS = 50000;
    /** Tika registry used for alias and supertype lookups; assigned in initMediaTypes(). */
    private static MediaTypeRegistry mediaTypeRegistry;
    /** Media-type detector, created on first use by parseSiteMap(URL). */
    private static Tika tika;
    /** When true, URLs that fail the urlIsLegal check are dropped rather than added to the result. */
    private boolean strict;
    public static final Logger LOG = LoggerFactory.getLogger(SiteMapParser.class);
    /** 10 * 1024 * 1024 bytes. NOTE(review): no use of this value is visible in this class. */
    public static int MAX_BYTES_ALLOWED = 10485760;
    /** Media types routed to the XML parser (application/xml plus its registered aliases). */
    private static final List<MediaType> XML_MEDIA_TYPES = new ArrayList<MediaType>();
    /** Media types routed to the plain-text parser (text/plain plus its registered aliases). */
    private static final List<MediaType> TEXT_MEDIA_TYPES = new ArrayList<MediaType>();
    /** Media types routed to the gzip handler (application/gzip plus its registered aliases). */
    private static final List<MediaType> GZ_MEDIA_TYPES = new ArrayList<MediaType>();

    /**
     * Creates a parser in strict mode; equivalent to {@code new SiteMapParser(true)}.
     */
    public SiteMapParser() {
        this(true);
    }

    /**
     * Creates a parser with the given strictness.
     *
     * @param z {@code true} to drop URLs that fail the urlIsLegal check;
     *          {@code false} to keep them, passing the check result along with each URL
     */
    public SiteMapParser(boolean z) {
        this.strict = z;
    }

    /**
     * @return whether this parser was configured in strict mode
     */
    public boolean isStrict() {
        return this.strict;
    }

    /**
     * Fetches the content behind {@code url}, lets Tika detect its media type
     * from the bytes and the file-name part of the URL path, and parses it.
     *
     * @param url location of the sitemap; may be {@code null}
     * @return the parsed sitemap, or {@code null} when {@code url} is {@code null}
     * @throws UnknownFormatException if the content cannot be parsed as a supported format
     * @throws IOException if reading the URL or the content fails
     */
    public AbstractSiteMap parseSiteMap(URL url) throws UnknownFormatException, IOException {
        if (url != null) {
            // The detector is shared by all instances and created on first use.
            if (tika == null) {
                tika = new Tika();
            }
            byte[] content = IOUtils.toByteArray(url);
            String fileName = FilenameUtils.getName(url.getPath());
            String contentType = tika.detect(content, fileName);
            return parseSiteMap(contentType, content, url);
        }
        return null;
    }

    /**
     * Parses already-fetched content for a known sitemap entry.
     *
     * <p>The result inherits the last-modified value of {@code abstractSiteMap},
     * and {@code abstractSiteMap} itself is flagged as processed.
     *
     * @param str media type of the content, as a string
     * @param bArr raw content
     * @param abstractSiteMap entry supplying the URL and last-modified value
     * @return the newly parsed sitemap
     * @throws UnknownFormatException if the content cannot be parsed as a supported format
     * @throws IOException if reading the content fails
     */
    public AbstractSiteMap parseSiteMap(String str, byte[] bArr, AbstractSiteMap abstractSiteMap) throws UnknownFormatException, IOException {
        final AbstractSiteMap result = parseSiteMap(str, bArr, abstractSiteMap.getUrl());
        result.setLastModified(abstractSiteMap.getLastModified());
        abstractSiteMap.setProcessed(true);
        return result;
    }

    /**
     * Dispatches content to the XML, text or gzip handler according to its media type.
     *
     * <p>A type that is in none of the three lists is retried with its supertype
     * from the Tika registry; the retry chain ends in an exception once the type
     * is {@code application/octet-stream} or cannot be parsed.
     *
     * @param str media type of the content, as a string
     * @param bArr raw content
     * @param url URL the content was obtained from
     * @return the parsed sitemap
     * @throws UnknownFormatException if no handler applies or parsing fails
     * @throws IOException if reading the content fails
     */
    public AbstractSiteMap parseSiteMap(String str, byte[] bArr, URL url) throws UnknownFormatException, IOException {
        MediaType mediaType = MediaType.parse(str);
        if (mediaType == null || mediaType.equals(MediaType.OCTET_STREAM)) {
            throw new UnknownFormatException("Can't parse sitemap with MediaType of: " + str + " (at: " + url + ")");
        }
        if (XML_MEDIA_TYPES.contains(mediaType)) {
            return processXml(url, bArr);
        }
        if (TEXT_MEDIA_TYPES.contains(mediaType)) {
            return processText(url.toString(), bArr);
        }
        if (GZ_MEDIA_TYPES.contains(mediaType)) {
            return processGzip(url, bArr);
        }
        // Not directly supported: climb one level in the media-type hierarchy and retry.
        MediaType superType = mediaTypeRegistry.getSupertype(mediaType);
        return parseSiteMap(superType.toString(), bArr, url);
    }

    /**
     * Wraps raw XML bytes in a character stream and hands them to the XML parser.
     *
     * <p>The bytes are decoded as UTF-8, the encoding the sitemap protocol
     * requires, instead of the JVM's platform default charset, so the result no
     * longer depends on the machine the parser runs on.
     *
     * @param url URL the content was obtained from
     * @param bArr raw XML content
     * @return the parsed sitemap, sitemap index or feed
     * @throws UnknownFormatException if the XML is malformed or of an unknown kind
     */
    private AbstractSiteMap processXml(URL url, byte[] bArr) throws UnknownFormatException {
        // BOMInputStream hides a leading byte-order mark from the reader.
        BOMInputStream bOMInputStream = new BOMInputStream(new ByteArrayInputStream(bArr));
        InputSource inputSource = new InputSource();
        inputSource.setCharacterStream(new BufferedReader(new InputStreamReader(bOMInputStream, Charset.forName("UTF-8"))));
        return processXml(url, inputSource);
    }

    /**
     * Parses a plain-text sitemap: one URL per line, empty lines ignored.
     *
     * <p>Content is decoded as UTF-8 (the encoding the sitemap protocol requires)
     * rather than the platform default charset. Reading stops as soon as
     * {@code MAX_URLS} URLs have been accepted; lines that are not valid URLs are
     * logged and skipped. In strict mode, URLs that fail the urlIsLegal check
     * against the sitemap's base URL are skipped as well.
     *
     * @param str URL of the sitemap, as a string
     * @param bArr raw text content
     * @return the populated sitemap, flagged as processed
     * @throws IOException if reading the content fails
     */
    private SiteMap processText(String str, byte[] bArr) throws IOException {
        LOG.debug("Processing textual Sitemap");
        SiteMap siteMap = new SiteMap(str);
        siteMap.setType(AbstractSiteMap.SitemapType.TEXT);
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new BOMInputStream(new ByteArrayInputStream(bArr)), Charset.forName("UTF-8")));
        try {
            // 1-based position of the next URL to accept; doubles as the accepted-URL counter.
            int i = 1;
            String line;
            while (i <= MAX_URLS && (line = reader.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }
                try {
                    URL url = new URL(line);
                    boolean urlIsLegal = urlIsLegal(siteMap.getBaseUrl(), url.toString());
                    if (urlIsLegal || !this.strict) {
                        LOG.debug("  {}. {}", Integer.valueOf(i), url);
                        i++;
                        siteMap.addSiteMapUrl(new SiteMapURL(url, urlIsLegal));
                    }
                } catch (MalformedURLException e) {
                    LOG.warn("Bad URL [{}]. From Sitemap: [{}]", line, str);
                }
            }
        } finally {
            reader.close();
        }
        siteMap.setProcessed(true);
        return siteMap;
    }

    /**
     * Decompresses gzip content and parses the result as XML.
     *
     * <p>The decompression stream is closed in a {@code finally} block, so it is
     * released even when XML parsing fails.
     *
     * @param url URL the content was obtained from; a trailing {@code .gz} is
     *            removed to form the system id given to the XML parser
     * @param bArr gzip-compressed content
     * @return the parsed sitemap, sitemap index or feed
     * @throws MalformedURLException declared for compatibility with existing callers
     * @throws IOException if the content is not valid gzip data or closing the stream fails
     * @throws UnknownFormatException if the decompressed XML is malformed or of an unknown kind
     */
    private AbstractSiteMap processGzip(URL url, byte[] bArr) throws MalformedURLException, IOException, UnknownFormatException {
        LOG.debug("Processing gzip");
        String xmlUrl = url.toString().replaceFirst("\\.gz$", "");
        LOG.debug("XML url = {}", xmlUrl);
        InputStream stream = new BOMInputStream(new GZIPInputStream(new ByteArrayInputStream(bArr)));
        try {
            InputSource inputSource = new InputSource(stream);
            inputSource.setSystemId(xmlUrl);
            return processXml(url, inputSource);
        } finally {
            stream.close();
        }
    }

    /**
     * Parses an XML document and dispatches on its content: a
     * {@code sitemapindex} element selects the index parser, {@code urlset} the
     * XML sitemap parser, and any {@code link} element the Atom/RSS parser.
     *
     * <p>Sitemaps come from arbitrary web servers, so resolution of external
     * entities and external DTDs is switched off before parsing. Format
     * exceptions raised by this method or the sub-parsers are passed through
     * with their original message; any other failure is logged with its stack
     * trace and reported as a generic parse error.
     *
     * @param url URL the content was obtained from (used for results and messages)
     * @param inputSource the XML to parse
     * @return the parsed sitemap, sitemap index or feed
     * @throws UnknownFormatException if the XML is malformed or of an unknown kind
     */
    private AbstractSiteMap processXml(URL url, InputSource inputSource) throws UnknownFormatException {
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            disableXmlFeature(factory, "http://xml.org/sax/features/external-general-entities");
            disableXmlFeature(factory, "http://xml.org/sax/features/external-parameter-entities");
            disableXmlFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd");
            Document parse = factory.newDocumentBuilder().parse(inputSource);
            if (parse.getElementsByTagName("sitemapindex").getLength() > 0) {
                return parseSitemapIndex(url, parse.getElementsByTagName("sitemap"));
            }
            if (parse.getElementsByTagName("urlset").getLength() > 0) {
                return parseXmlSitemap(url, parse);
            }
            if (parse.getElementsByTagName("link").getLength() > 0) {
                return parseSyndicationFormat(url, parse);
            }
            throw new UnknownFormatException("Unknown XML format for: " + url);
        } catch (UnknownFormatException e) {
            // Already carries a specific message; do not replace it with the generic one below.
            throw e;
        } catch (Exception e) {
            LOG.debug("Error parsing XML for: " + url, e);
            throw new UnknownFormatException("Error parsing XML for: " + url);
        }
    }

    /** Switches one parser feature off, tolerating parser implementations that do not recognise it. */
    private static void disableXmlFeature(DocumentBuilderFactory factory, String feature) {
        try {
            factory.setFeature(feature, false);
        } catch (ParserConfigurationException e) {
            LOG.debug("XML parser does not support feature " + feature, e);
        }
    }

    /**
     * Builds a sitemap from the {@code url} elements of an XML sitemap document.
     *
     * <p>At most {@code MAX_URLS} {@code url} elements are examined, matching the
     * cap the sitemap-index, Atom and RSS loops already apply. Entries whose
     * {@code loc} is not a valid URL are logged and skipped. In strict mode,
     * URLs that fail the urlIsLegal check against the sitemap's base URL are
     * skipped as well.
     *
     * @param url URL of the sitemap
     * @param document parsed XML document
     * @return the populated sitemap, flagged as processed
     */
    private SiteMap parseXmlSitemap(URL url, Document document) {
        SiteMap siteMap = new SiteMap(url);
        siteMap.setType(AbstractSiteMap.SitemapType.XML);
        NodeList urlNodes = document.getElementsByTagName("url");
        for (int i = 0; i < urlNodes.getLength() && i < MAX_URLS; i++) {
            Node item = urlNodes.item(i);
            if (item.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element element = (Element) item;
            String loc = getElementValue(element, "loc");
            try {
                URL entryUrl = new URL(loc);
                String lastMod = getElementValue(element, "lastmod");
                String changeFreq = getElementValue(element, "changefreq");
                String priority = getElementValue(element, "priority");
                boolean urlIsLegal = urlIsLegal(siteMap.getBaseUrl(), entryUrl.toString());
                if (urlIsLegal || !this.strict) {
                    SiteMapURL siteMapURL = new SiteMapURL(entryUrl.toString(), lastMod, changeFreq, priority, urlIsLegal);
                    siteMap.addSiteMapUrl(siteMapURL);
                    LOG.debug("  {}. {}", Integer.valueOf(i + 1), siteMapURL);
                }
            } catch (MalformedURLException e) {
                LOG.debug("Bad url: [{}]", loc);
                LOG.trace("Can't create an entry with a bad URL", e);
            }
        }
        siteMap.setProcessed(true);
        return siteMap;
    }

    /**
     * Builds a sitemap index from a list of {@code sitemap} nodes.
     *
     * <p>Only the first {@code MAX_URLS} nodes are examined. The location comes
     * from the {@code loc} child or, when that yields nothing, from the node's
     * trimmed text content. Entries with an invalid URL are logged and skipped.
     *
     * @param url URL of the index
     * @param nodeList the {@code sitemap} nodes of the document
     * @return the populated index, flagged as processed
     */
    private SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) {
        LOG.debug("Parsing Sitemap Index");
        SiteMapIndex index = new SiteMapIndex(url);
        index.setType(AbstractSiteMap.SitemapType.INDEX);
        for (int pos = 0; pos < MAX_URLS && pos < nodeList.getLength(); pos++) {
            Node node = nodeList.item(pos);
            if (node.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element entry = (Element) node;
            String loc = getElementValue(entry, "loc");
            if (loc == null) {
                loc = entry.getTextContent().trim();
            }
            try {
                SiteMap child = new SiteMap(new URL(loc), SiteMap.convertToDate(getElementValue(entry, "lastmod")));
                index.addSitemap(child);
                LOG.debug("  {}. {}", Integer.valueOf(pos + 1), child);
            } catch (MalformedURLException e) {
                LOG.trace("Don't create an entry with a bad URL", e);
                LOG.debug("Bad url: [{}]", loc);
            }
        }
        index.setProcessed(true);
        return index;
    }

    /**
     * Parses an Atom or RSS document into a sitemap.
     *
     * <p>A {@code feed} element selects the Atom parser; otherwise an
     * {@code rss} element selects the RSS parser.
     *
     * @param url URL of the feed
     * @param document parsed XML document
     * @return the populated sitemap, flagged as processed
     * @throws UnknownFormatException if the document has neither a {@code feed} nor an {@code rss} element
     */
    private SiteMap parseSyndicationFormat(URL url, Document document) throws UnknownFormatException {
        SiteMap siteMap = new SiteMap(url);
        NodeList feeds = document.getElementsByTagName("feed");
        if (feeds.getLength() > 0) {
            parseAtom(siteMap, (Element) feeds.item(0), document);
        } else if (document.getElementsByTagName("rss").getLength() > 0) {
            parseRSS(siteMap, document);
        } else {
            throw new UnknownFormatException("Unknown syndication format at " + url);
        }
        siteMap.setProcessed(true);
        return siteMap;
    }

    /**
     * Adds the entries of an Atom feed to {@code siteMap}.
     *
     * <p>The feed's {@code modified} value is used as the last-modified value of
     * every entry. Each entry's URL is the {@code href} attribute of its first
     * {@code link} element. Only the first {@code MAX_URLS} entries are
     * examined; entries with an invalid URL are logged and skipped. In strict
     * mode, URLs that fail the urlIsLegal check are skipped as well.
     *
     * @param siteMap sitemap to populate
     * @param element the {@code feed} element
     * @param document the whole parsed document
     */
    private void parseAtom(SiteMap siteMap, Element element, Document document) {
        LOG.debug("Parsing Atom XML");
        siteMap.setType(AbstractSiteMap.SitemapType.ATOM);
        String lastMod = getElementValue(element, "modified");
        LOG.debug("lastMod = {}", lastMod);
        NodeList entries = document.getElementsByTagName("entry");
        for (int pos = 0; pos < MAX_URLS && pos < entries.getLength(); pos++) {
            Node node = entries.item(pos);
            if (node.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            String href = getElementAttributeValue((Element) node, "link", "href");
            LOG.debug("href = {}", href);
            try {
                URL entryUrl = new URL(href);
                boolean legal = urlIsLegal(siteMap.getBaseUrl(), entryUrl.toString());
                if (!legal && this.strict) {
                    continue;
                }
                SiteMapURL entry = new SiteMapURL(entryUrl.toString(), lastMod, (String) null, (String) null, legal);
                siteMap.addSiteMapUrl(entry);
                LOG.debug("  {}. {}", Integer.valueOf(pos + 1), entry);
            } catch (MalformedURLException e) {
                LOG.trace("Can't create an entry with a bad URL", e);
                LOG.debug("Bad url: [{}]", href);
            }
        }
    }

    /**
     * Adds the items of an RSS feed to {@code siteMap}.
     *
     * <p>The {@code pubDate} of the first {@code channel} element is used as the
     * last-modified value of every item; it is {@code null} when the document
     * has no {@code channel} element. Each item's URL is the text of its
     * {@code link} element. Only the first {@code MAX_URLS} items are examined;
     * items with an invalid URL are logged and skipped. In strict mode, URLs
     * that fail the urlIsLegal check are skipped as well.
     *
     * @param siteMap sitemap to populate
     * @param document the parsed RSS document
     */
    private void parseRSS(SiteMap siteMap, Document document) {
        LOG.debug("Parsing RSS doc");
        siteMap.setType(AbstractSiteMap.SitemapType.RSS);
        // A feed without <channel> still has usable items; just leave lastMod unset.
        Node channel = document.getElementsByTagName("channel").item(0);
        String lastMod = channel == null ? null : getElementValue((Element) channel, "pubDate");
        LOG.debug("lastMod = {}", lastMod);
        NodeList items = document.getElementsByTagName("item");
        for (int i = 0; i < items.getLength() && i < MAX_URLS; i++) {
            Node item = items.item(i);
            if (item.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            String link = getElementValue((Element) item, "link");
            LOG.debug("link = {}", link);
            try {
                URL url = new URL(link);
                boolean urlIsLegal = urlIsLegal(siteMap.getBaseUrl(), url.toString());
                if (urlIsLegal || !this.strict) {
                    SiteMapURL siteMapURL = new SiteMapURL(url.toString(), lastMod, (String) null, (String) null, urlIsLegal);
                    siteMap.addSiteMapUrl(siteMapURL);
                    LOG.debug("  {}. {}", Integer.valueOf(i + 1), siteMapURL);
                }
            } catch (MalformedURLException e) {
                LOG.trace("Can't create an entry with a bad URL", e);
                LOG.debug("Bad url: [{}]", link);
            }
        }
    }

    /**
     * Returns the trimmed text of the first descendant of {@code element} with
     * the given tag name.
     *
     * <p>The whole text content of the matched element is used, not only its
     * first child node. A value written as CDATA after leading whitespace, or
     * split across several text nodes, is therefore returned in full, and a
     * non-text first child no longer causes a {@code NullPointerException}.
     *
     * @param element element to search in
     * @param str tag name to look for
     * @return the trimmed text, or {@code null} if there is no such descendant
     *         or it has no child nodes
     */
    private String getElementValue(Element element, String str) {
        Node match = element.getElementsByTagName(str).item(0);
        if (match == null || !match.hasChildNodes()) {
            return null;
        }
        return match.getTextContent().trim();
    }

    /**
     * Reads an attribute from the first descendant of {@code element} with the
     * given tag name.
     *
     * @param element element to search in
     * @param str tag name to look for
     * @param str2 attribute name
     * @return the value returned by {@code getAttribute} on the matched element,
     *         or {@code null} if there is no such descendant
     */
    private String getElementAttributeValue(Element element, String str, String str2) {
        Element match = (Element) element.getElementsByTagName(str).item(0);
        return match == null ? null : match.getAttribute(str2);
    }

    /**
     * Checks whether {@code str2} starts with the base URL {@code str}, ignoring case.
     *
     * <p>The comparison uses {@link String#regionMatches(boolean, int, String, int, int)},
     * which is independent of the default locale. The previous
     * {@code toLowerCase()} call could fail under locales with special casing
     * rules (for example Turkish), and it folded only the candidate's case, so a
     * base URL containing upper-case characters could never match.
     *
     * @param str base URL of the sitemap; {@code null} makes every URL illegal
     * @param str2 URL to test
     * @return {@code true} if {@code str2} begins with {@code str}, case-insensitively
     */
    private boolean urlIsLegal(String str, String str2) {
        boolean legal = str != null
                && str.length() <= str2.length()
                && str2.regionMatches(true, 0, str, 0, str.length());
        LOG.trace("urlIsLegal: {}  <= {}  ? {}", new Object[]{str, str2, Boolean.valueOf(legal)});
        return legal;
    }

    /**
     * Obtains the Tika media-type registry and fills the XML, text and gzip
     * media-type lists with their base type followed by its registered aliases.
     */
    private static void initMediaTypes() {
        mediaTypeRegistry = new AutoDetectParser().getMediaTypeRegistry();
        addWithAliases(XML_MEDIA_TYPES, MediaType.APPLICATION_XML);
        addWithAliases(TEXT_MEDIA_TYPES, MediaType.TEXT_PLAIN);
        addWithAliases(GZ_MEDIA_TYPES, MediaType.parse("application/gzip"));
    }

    /** Appends {@code type} and then all of its registry aliases to {@code target}. */
    private static void addWithAliases(List<MediaType> target, MediaType type) {
        target.add(type);
        target.addAll(mediaTypeRegistry.getAliases(type));
    }

    // Runs once at class initialisation, after the static list fields above
    // have been created, and populates the media-type lists and registry.
    static {
        initMediaTypes();
    }
}
