package org.archive.extract;

import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.solr.response.RawResponseWriter;
import org.archive.format.ArchiveFileConstants;
import org.archive.format.arc.ARCConstants;
import org.archive.format.gzip.GZIPFormatException;
import org.archive.format.json.JSONUtils;
import org.archive.format.json.SimpleJSONPathSpec;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceConstants;
import org.archive.url.URLKeyMaker;
import org.archive.url.UsableURIFactory;
import org.archive.url.WaybackURLKeyMaker;
import org.archive.util.IAUtils;
import org.archive.util.StreamCopy;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.cdx.CDXFormatIndex;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jwat.warc.WarcConstants;

/* loaded from: input_file:WEB-INF/lib/webarchive-commons-1.1.3.jar:org/archive/extract/RealCDXExtractorOutput.class */
/**
 * {@link ExtractorOutput} that writes one space-separated index line per resource to a
 * {@link PrintWriter}.
 *
 * <p>Each line carries, in order: URL key, date, URL, MIME type, HTTP status, digest,
 * redirect target, robot flags, GZip deflate length, container offset and container
 * filename. Fields that do not apply to a record are written as {@code "-"}; fields that
 * no branch of the record classification fills in keep the placeholder {@code "TBD"}.
 */
public class RealCDXExtractorOutput implements ExtractorOutput {
    public static final String X_ROBOTS_HTTP_HEADER = "X-Robots-Tag";

    private static final Logger LOG = Logger.getLogger(RealCDXExtractorOutput.class.getName());

    /** Matches a meta-refresh {@code content} value such as {@code "5; url=http://example.com/"}. */
    private static final Pattern refreshURLPattern = Pattern.compile(
            "^\\d+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    private static final String NO_NOTHIN_MATCH = "NONE";
    private static final String NO_FOLLOW_MATCH = "NOFOLLOW";
    private static final String NO_INDEX_MATCH = "NOINDEX";
    private static final String NO_ARCHIVE_MATCH = "NOARCHIVE";

    /** JSON path of the HTTP response headers object inside the top-level metadata. */
    private static final String HTTP_HEADERS_PATH =
            "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
    /** JSON path of the array of HTML {@code <meta>} tags inside the top-level metadata. */
    private static final String HTML_METAS_PATH =
            "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata.Head.Metas";

    /** Value written for a field that does not apply to the record. */
    private static final String NOT_APPLICABLE = "-";
    /** Initial value of every per-record field until a classification branch overwrites it. */
    private static final String UNSET = "TBD";

    private final PrintWriter out;
    SimpleJSONPathSpec filenameSpec = new SimpleJSONPathSpec("Container.Filename");
    SimpleJSONPathSpec offsetSpec = new SimpleJSONPathSpec("Container.Offset");
    SimpleJSONPathSpec gzDeflateLengthSpec = new SimpleJSONPathSpec("Container.Gzip-Metadata.Deflate-Length");
    SimpleJSONPathSpec formatSpec = new SimpleJSONPathSpec("Envelope.Format");
    SimpleJSONPathSpec arcURL = new SimpleJSONPathSpec("Envelope.ARC-Header-Metadata.Target-URI");
    SimpleJSONPathSpec arcDate = new SimpleJSONPathSpec("Envelope.ARC-Header-Metadata.Date");
    SimpleJSONPathSpec arcContentType = new SimpleJSONPathSpec("Envelope.ARC-Header-Metadata.Content-Type");
    SimpleJSONPathSpec warcURL = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Target-URI");
    SimpleJSONPathSpec warcDate = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Date");
    SimpleJSONPathSpec warcType = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Type");
    SimpleJSONPathSpec warcContentType = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.Content-Type");
    SimpleJSONPathSpec envBlockDigest = new SimpleJSONPathSpec("Envelope.Block-Digest");
    SimpleJSONPathSpec warcPayloadDigest = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Payload-Digest");
    SimpleJSONPathSpec httpResponseCode = new SimpleJSONPathSpec("Envelope.Payload-Metadata.HTTP-Response-Metadata.Response-Message.Status");
    SimpleJSONPathSpec httpEntityDigest = new SimpleJSONPathSpec("Envelope.Payload-Metadata.HTTP-Response-Metadata.Entity-Digest");
    SimpleJSONPathSpec HTTPLocation = new SimpleJSONPathSpec(HTTP_HEADERS_PATH);
    /** When true, the indented JSON metadata is appended as a twelfth field on every line. */
    private boolean dumpJSON = false;
    private final URLKeyMaker keyMaker;

    /**
     * Creates an output that writes to {@code printWriter} and derives URL keys with
     * {@code uRLKeyMaker}. The header line {@link CDXFormatIndex#CDX_HEADER_MAGIC_NEW} is
     * written and flushed immediately.
     *
     * @param printWriter destination for the header and all subsequent lines
     * @param uRLKeyMaker strategy used to turn each record URL into its key field
     */
    public RealCDXExtractorOutput(PrintWriter printWriter, URLKeyMaker uRLKeyMaker) {
        this.out = printWriter;
        this.keyMaker = uRLKeyMaker;
        printWriter.println(CDXFormatIndex.CDX_HEADER_MAGIC_NEW);
        printWriter.flush();
    }

    /**
     * Creates an output that writes to {@code printWriter} using a {@link WaybackURLKeyMaker}.
     *
     * @param printWriter destination for the header and all subsequent lines
     */
    public RealCDXExtractorOutput(PrintWriter printWriter) {
        this(printWriter, new WaybackURLKeyMaker());
    }

    /**
     * Drains the resource's input stream, then writes one index line built from the
     * resource's top-level metadata.
     *
     * <p>A {@link GZIPFormatException} is logged and the resource is skipped rather than
     * failing the caller.
     *
     * @param resource the record to index
     * @throws IOException on I/O failure, or wrapping a {@link URISyntaxException} or
     *     {@link JSONException} raised while building the line
     */
    @Override // org.archive.extract.ExtractorOutput
    public void output(Resource resource) throws IOException {
        CountingOutputStream drained = new CountingOutputStream(ByteStreams.nullOutputStream());
        try {
            StreamCopy.copy(resource.getInputStream(), drained);
            long leftover = drained.getCount();
            if (leftover > 0) {
                LOG.info(leftover + " unconsumed bytes in Resource InputStream.");
            }
            writeLine(resource);
        } catch (GZIPFormatException e) {
            // Deliberate best-effort: a corrupt record must not abort the whole run.
            LOG.log(Level.WARNING, "Malformed GZIP data while processing resource; skipping it", e);
        }
    }

    /** Mutable holder for the per-record fields that depend on the record type. */
    private static final class CdxFields {
        String url = UNSET;
        String date = UNSET;
        String mime = UNSET;
        String status = UNSET;
        String digest = UNSET;
        String robots = UNSET;
        String redirect = UNSET;

        /** Marks status, redirect and robot flags as not applicable. */
        void clearHttpOnlyFields() {
            this.redirect = NOT_APPLICABLE;
            this.robots = NOT_APPLICABLE;
            this.status = NOT_APPLICABLE;
        }
    }

    /** Classifies the record, resolves its redirect, computes the key and prints the line. */
    private void writeLine(Resource resource) throws IOException {
        try {
            MetaData meta = resource.getMetaData().getTopMetaData();
            String filename = getContainerFilename(meta);
            String offset = getContainerOffset(meta);
            String gzLength = getGZLength(meta);
            String format = getEnvelopeFormat(meta);

            CdxFields f = new CdxFields();
            if (format.equals(ResourceConstants.ENVELOPE_FORMAT_WARC)) {
                fillFromWarc(meta, filename, f);
            } else if (format.equals(ResourceConstants.ENVELOPE_FORMAT_ARC)) {
                fillFromArc(meta, filename, f);
            }

            if (!f.redirect.equals(NOT_APPLICABLE)) {
                f.redirect = resolve(f.url, f.redirect);
            }
            String key = this.keyMaker.makeKey(f.url);
            if (this.dumpJSON) {
                this.out.format("%s %s %s %s %s %s %s %s %s %s %s %s\n", key, f.date, f.url, f.mime,
                        f.status, f.digest, f.redirect, f.robots, gzLength, offset, filename,
                        meta.toString(1));
            } else {
                this.out.format("%s %s %s %s %s %s %s %s %s %s %s\n", key, f.date, f.url, f.mime,
                        f.status, f.digest, f.redirect, f.robots, gzLength, offset, filename);
            }
            this.out.flush();
        } catch (URISyntaxException | JSONException e) {
            throw new IOException(e);
        }
    }

    /** Fills {@code f} for a WARC record, dispatching on the WARC-Type header. */
    private void fillFromWarc(MetaData meta, String filename, CdxFields f) throws JSONException {
        f.url = getWARCURL(meta);
        f.date = getWARCDate(meta);
        String type = getWARCType(meta);
        if (type.equals("response")) {
            String contentType = getWARCContentType(meta);
            if (contentType.equals("text/dns")) {
                f.clearHttpOnlyFields();
                f.mime = contentType;
                f.digest = getEnvelopeBlockDigest(meta);
            } else if (contentType.equals("application/http; msgtype=response")) {
                fillFromHttpResponse(meta, f);
            }
            // Any other response content type leaves the remaining fields at "TBD".
        } else if (type.equals(WarcConstants.RT_WARCINFO)) {
            f.url = "warcinfo:/" + filename + "/" + IAUtils.COMMONS_VERSION.replace(" ", "_");
            f.clearHttpOnlyFields();
            f.mime = "warc-info";
            f.digest = getEnvelopeBlockDigest(meta);
        } else if (type.equals("request")) {
            f.mime = "warc/request";
            f.clearHttpOnlyFields();
            f.digest = getEnvelopeBlockDigest(meta);
        } else if (type.equals(WarcConstants.RT_METADATA)) {
            f.mime = "warc/metadata";
            f.clearHttpOnlyFields();
            f.digest = getEnvelopeBlockDigest(meta);
        } else if (type.equals(WarcConstants.RT_REVISIT)) {
            f.mime = "warc/revisit";
            f.clearHttpOnlyFields();
            // Revisit records are the only ones indexed by payload digest.
            f.digest = getWARCPayloadDigest(meta);
        }
    }

    /** Fills {@code f} for an ARC record (filedesc header, alexa/dat, or HTTP response). */
    private void fillFromArc(MetaData meta, String filename, CdxFields f) throws JSONException {
        f.url = getARCURL(meta);
        f.date = getARCDate(meta);
        if (f.url.startsWith("filedesc:")) {
            f.url = ARCConstants.FILEDESC_SCHEME + filename + "/"
                    + IAUtils.COMMONS_VERSION.replace(" ", "_");
            f.mime = "arc-filedesc";
            f.clearHttpOnlyFields();
            f.digest = getEnvelopeBlockDigest(meta);
            return;
        }
        f.mime = getARCContentType(meta);
        if (f.mime.equals("alexa/dat")) {
            f.clearHttpOnlyFields();
            f.digest = getEnvelopeBlockDigest(meta);
        } else {
            fillFromHttpResponse(meta, f);
        }
    }

    /**
     * Fills status, digest, MIME type, redirect and robot flags from the HTTP response
     * metadata. For HTML responses, a missing redirect or robot value falls back to the
     * page's {@code <meta>} tags. Expects {@code f.url} to be set already.
     */
    private void fillFromHttpResponse(MetaData meta, CdxFields f) throws JSONException {
        f.status = getHTTPStatus(meta);
        f.digest = getHTTPEntityDigest(meta);
        JSONObject headers = JSONUtils.extractObject(meta, HTTP_HEADERS_PATH);
        f.mime = normalizeHTTPMime(scanHeadersLC(headers, ArchiveFileConstants.MIMETYPE_FIELD_KEY, "unk"));
        f.redirect = scanHeadersLC(headers, ARCConstants.LOCATION_HEADER_FIELD_KEY, NOT_APPLICABLE);
        String xRobots = scanHeadersLC(headers, X_ROBOTS_HTTP_HEADER, null);
        f.robots = xRobots != null ? parseRobotInstructions(xRobots) : NOT_APPLICABLE;
        if (f.mime.toLowerCase(Locale.ROOT).contains("html")) {
            if (f.redirect.equals(NOT_APPLICABLE)) {
                f.redirect = extractHTMLMetaRefresh(f.url, meta);
            }
            if (f.robots.equals(NOT_APPLICABLE)) {
                f.robots = extractHTMLRobots(meta);
            }
        }
    }

    /**
     * Returns the robot flags from the first {@code <meta name="robots">} tag that has a
     * {@code content} attribute, or {@code "-"} when there is none.
     */
    private String extractHTMLRobots(MetaData metaData) {
        JSONArray metas = JSONUtils.extractArray(metaData, HTML_METAS_PATH);
        if (metas == null) {
            return NOT_APPLICABLE;
        }
        int count = metas.length();
        for (int i = 0; i < count; i++) {
            JSONObject tag = metas.optJSONObject(i);
            if (tag == null) {
                continue;
            }
            String name = scanHeadersLC(tag, "name", null);
            if (name == null || !name.toLowerCase(Locale.ROOT).equals("robots")) {
                continue;
            }
            // NOTE(review): RawResponseWriter.CONTENT is an unrelated Solr constant, presumably
            // picked by the decompiler for its value (expected "content") - confirm.
            String content = scanHeadersLC(tag, RawResponseWriter.CONTENT, null);
            if (content != null) {
                return parseRobotInstructions(content);
            }
        }
        return NOT_APPLICABLE;
    }

    /**
     * Returns the target of the first {@code <meta http-equiv="refresh">} tag whose
     * {@code content} parses as a refresh directive, or {@code "-"} when there is none.
     * Tags whose content cannot be parsed are skipped.
     *
     * @param str the record URL (currently unused)
     */
    private String extractHTMLMetaRefresh(String str, MetaData metaData) {
        JSONArray metas = JSONUtils.extractArray(metaData, HTML_METAS_PATH);
        if (metas == null) {
            return NOT_APPLICABLE;
        }
        int count = metas.length();
        for (int i = 0; i < count; i++) {
            JSONObject tag = metas.optJSONObject(i);
            if (tag == null) {
                continue;
            }
            String httpEquiv = scanHeadersLC(tag, "http-equiv", null);
            if (httpEquiv == null || !httpEquiv.toLowerCase(Locale.ROOT).equals("refresh")) {
                continue;
            }
            String content = scanHeadersLC(tag, RawResponseWriter.CONTENT, null);
            if (content == null) {
                continue;
            }
            // parseMetaRefreshContent signals "no match" with "-", never null; keep scanning
            // so a malformed tag does not hide a later, valid one.
            String target = parseMetaRefreshContent(content);
            if (target != null && !target.equals(NOT_APPLICABLE)) {
                return target;
            }
        }
        return NOT_APPLICABLE;
    }

    /**
     * Resolves {@code str2} against the base URL {@code str} and returns it in ASCII form.
     * Falls back to {@code str2} unchanged when either value cannot be parsed.
     */
    private String resolve(String str, String str2) {
        try {
            return new URL(new URL(str), str2).toURI().toASCIIString();
        } catch (NullPointerException | MalformedURLException | URISyntaxException e) {
            return str2;
        }
    }

    /**
     * Looks up a key in {@code jSONObject}, comparing names case-insensitively and ignoring
     * surrounding whitespace.
     *
     * @param jSONObject object to search; may be null
     * @param str name to look for
     * @param str2 value returned when the object is null or empty, the name is absent, or
     *     its value cannot be read as a string
     * @return the trimmed value of the first matching key, or {@code str2}
     */
    private String scanHeadersLC(JSONObject jSONObject, String str, String str2) {
        if (jSONObject == null || jSONObject.length() == 0) {
            return str2;
        }
        // Locale.ROOT keeps ASCII header names stable under locales such as Turkish,
        // where "I" would otherwise lower-case to a dotless i.
        String wanted = str.toLowerCase(Locale.ROOT).trim();
        for (String name : JSONObject.getNames(jSONObject)) {
            if (wanted.equals(name.toLowerCase(Locale.ROOT).trim())) {
                try {
                    return jSONObject.getString(name).trim();
                } catch (JSONException e) {
                    LOG.log(Level.WARNING, "Could not read value of '" + name + "' as a string", e);
                    return str2;
                }
            }
        }
        return str2;
    }

    private String getContainerFilename(MetaData metaData) {
        return unwrapFirst(this.filenameSpec.extract(metaData), NOT_APPLICABLE);
    }

    private String getContainerOffset(MetaData metaData) {
        return unwrapFirst(this.offsetSpec.extract(metaData), NOT_APPLICABLE);
    }

    private String getGZLength(MetaData metaData) {
        return unwrapFirst(this.gzDeflateLengthSpec.extract(metaData), NOT_APPLICABLE);
    }

    private String getEnvelopeFormat(MetaData metaData) {
        return unwrapFirst(this.formatSpec.extract(metaData), NOT_APPLICABLE);
    }

    private String getWARCURL(MetaData metaData) {
        return unwrapFirst(this.warcURL.extract(metaData), NOT_APPLICABLE);
    }

    private String getWARCDate(MetaData metaData) {
        return normalizeWARCDate(unwrapFirst(this.warcDate.extract(metaData), NOT_APPLICABLE));
    }

    private String getWARCType(MetaData metaData) {
        return unwrapFirst(this.warcType.extract(metaData), NOT_APPLICABLE);
    }

    private String getWARCPayloadDigest(MetaData metaData) {
        return normalizeSHA1(unwrapFirst(this.warcPayloadDigest.extract(metaData), NOT_APPLICABLE));
    }

    private String getHTTPStatus(MetaData metaData) {
        return unwrapFirst(this.httpResponseCode.extract(metaData), NOT_APPLICABLE);
    }

    private String getWARCContentType(MetaData metaData) {
        return unwrapFirst(this.warcContentType.extract(metaData), NOT_APPLICABLE);
    }

    private String getEnvelopeBlockDigest(MetaData metaData) {
        return normalizeSHA1(unwrapFirst(this.envBlockDigest.extract(metaData), NOT_APPLICABLE));
    }

    private String getHTTPEntityDigest(MetaData metaData) {
        return normalizeSHA1(unwrapFirst(this.httpEntityDigest.extract(metaData), NOT_APPLICABLE));
    }

    private String getARCURL(MetaData metaData) {
        return unwrapFirst(this.arcURL.extract(metaData), NOT_APPLICABLE);
    }

    private String getARCDate(MetaData metaData) {
        return unwrapFirst(this.arcDate.extract(metaData), NOT_APPLICABLE);
    }

    private String getARCContentType(MetaData metaData) {
        return normalizeHTTPMime(unwrapFirst(this.arcContentType.extract(metaData), NOT_APPLICABLE));
    }

    /**
     * Strips a leading {@code "sha1:"} prefix from a digest.
     *
     * @param str digest value; may be null
     * @return the digest without the prefix, the input unchanged when it has no such prefix,
     *     or {@code "-"} when the input is null
     */
    public String normalizeSHA1(String str) {
        if (str == null) {
            return NOT_APPLICABLE;
        }
        return str.startsWith("sha1:") ? str.substring(5) : str;
    }

    /**
     * Compacts a 20-character date of the shape {@code yyyy-MM-ddTHH:mm:ssZ} into the 14
     * characters {@code yyyyMMddHHmmss} by dropping the characters at the separator positions.
     *
     * @param str date value; may be null
     * @return the compacted date, the input unchanged when its length is not 20, or
     *     {@code "-"} when the input is null
     */
    public String normalizeWARCDate(String str) {
        if (str == null) {
            return NOT_APPLICABLE;
        }
        if (str.length() != 20) {
            return str;
        }
        // Only positions are checked, not the separator characters themselves.
        return new StringBuilder(14)
                .append(str, 0, 4)
                .append(str, 5, 7)
                .append(str, 8, 10)
                .append(str, 11, 13)
                .append(str, 14, 16)
                .append(str, 17, 19)
                .toString();
    }

    /** Replaces every space with {@link UsableURIFactory#ESCAPED_SPACE}. */
    private String escapeSpaces(String str) {
        return str.contains(" ") ? str.replace(" ", UsableURIFactory.ESCAPED_SPACE) : str;
    }

    /**
     * Reduces a Content-Type value to its media type: drops everything from the first
     * {@code ';'} (when it is not the first character), trims, and escapes remaining spaces.
     *
     * @param str raw Content-Type value; may be null
     * @return the normalized media type, or null when the input is null
     */
    public String normalizeHTTPMime(String str) {
        if (str == null) {
            return null;
        }
        int semicolon = str.indexOf(";");
        String mediaType = semicolon > 0 ? str.substring(0, semicolon) : str;
        return escapeSpaces(mediaType.trim());
    }

    /**
     * Returns the first element of the first inner list, or {@code str} when any level is
     * null or empty or the element itself is null or empty.
     */
    private String unwrapFirst(List<List<String>> list, String str) {
        if (list == null || list.isEmpty()) {
            return str;
        }
        List<String> first = list.get(0);
        if (first == null || first.isEmpty()) {
            return str;
        }
        String value = first.get(0);
        return (value == null || value.isEmpty()) ? str : value;
    }

    /**
     * Translates a robots directive string into flag characters.
     *
     * <p>Hyphens are removed and the text upper-cased before matching, so {@code "no-index"}
     * is treated like {@code "noindex"}. A value containing {@code NONE} yields
     * {@code "AIF"} regardless of other directives.
     *
     * @return the concatenated flags, or {@code "-"} when nothing matched or the input is null
     */
    private String parseRobotInstructions(String str) {
        if (str == null) {
            return NOT_APPLICABLE;
        }
        // Locale.ROOT: under a Turkish default locale "noindex" would upper-case with a
        // dotted capital I and never match NOINDEX.
        String directives = str.replace("-", "").toUpperCase(Locale.ROOT);
        StringBuilder flags = new StringBuilder(3);
        if (directives.contains(NO_FOLLOW_MATCH)) {
            flags.append(CaptureSearchResult.CAPTURE_ROBOT_NOFOLLOW);
        }
        if (directives.contains(NO_ARCHIVE_MATCH)) {
            flags.append(CaptureSearchResult.CAPTURE_ROBOT_NOARCHIVE);
        }
        if (directives.contains(NO_INDEX_MATCH)) {
            flags.append(CaptureSearchResult.CAPTURE_ROBOT_NOINDEX);
        }
        if (directives.contains(NO_NOTHIN_MATCH)) {
            flags.setLength(0);
            flags.append("AIF");
        }
        return flags.length() == 0 ? NOT_APPLICABLE : flags.toString();
    }

    /**
     * Extracts the URL part of a meta-refresh {@code content} value.
     *
     * @return the captured URL, or {@code "-"} when the value does not match the expected form
     */
    private String parseMetaRefreshContent(String str) {
        Matcher matcher = refreshURLPattern.matcher(str);
        return (matcher.matches() && matcher.groupCount() == 1) ? matcher.group(1) : NOT_APPLICABLE;
    }
}
