package org.archive.wayback.util.url;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;
import org.apache.commons.httpclient.URIException;
import org.apache.log4j.spi.LocationInfo;
import org.archive.format.warc.WARCConstants;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.util.ByteOp;

/* loaded from: input_file:WEB-INF/lib/openwayback-core-2.0.0.BETA.1.jar:org/archive/wayback/util/url/AggressiveUrlCanonicalizer.class */
/**
 * {@link UrlCanonicalizer} that lower-cases a URL, strips volatile fragments (user info, a
 * leading {@code www} host label and several kinds of session identifier) and reduces the
 * result to a lookup key of the form {@code host[:port]path[?query]}.
 *
 * <p>Stripping is table driven: entry {@code i} of {@link #choosers} is a cheap substring test
 * that guards the regular expression at entry {@code i} of {@link #strippers}. Capturing group 1
 * of each expression is the text that gets removed.
 */
public class AggressiveUrlCanonicalizer implements UrlCanonicalizer {
    private static final Logger LOGGER = Logger.getLogger(AggressiveUrlCanonicalizer.class.getName());

    /** Prefix of CDX header lines, which {@link #main(String[])} can pass through untouched. */
    private static final String CDX_PREFIX = " CDX ";

    /** Scheme assumed by {@link #urlStringToKey(String)} when the input carries none. */
    private static final String DEFAULT_SCHEME = "http://";

    /** Separator placed between the path and the query of a generated key. */
    private static final String QUERY_SEPARATOR = "?";

    // Choosers are matched against the lower-cased URL, so they are all lower case.
    private static final String STRIP_USERINFO_CHOOSER = "@";
    private static final String STRIP_WWW_CHOOSER = "/www";
    private static final String STRIP_PHPSESSION_ID_CHOOSER = "phpsessid=";
    private static final String STRIP_JSESSION_ID_CHOOSER = "jsessionid=";
    private static final String STRIP_ASPSESSION_CHOOSER = "aspsessionid";
    private static final String STRIP_ASPSESSION2_CHOOSER = ".aspx";
    private static final String STRIP_ASPSESSION3_CHOOSER = ".aspx";
    private static final String STRIP_SID_CHOOSER = "sid=";
    private static final String STRIP_CFSESSION_CHOOSER = "cftoken=";

    private static final Pattern STRIP_USERINFO_REGEX =
            Pattern.compile("^(?:(?:(?:https?)|(?:ftps?))://)([^/]+@)(?:.*)$", Pattern.CASE_INSENSITIVE);
    private static final Pattern STRIP_WWW_REGEX =
            Pattern.compile("(?i)^(?:https?://)(www[0-9]*\\.)(?:[^/]*/.+)$");
    private static final Pattern STRIP_PHPSESSION_ID_REGEX =
            Pattern.compile("^(?:.+)(phpsessid=[0-9a-zA-Z]{32}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE);
    private static final Pattern STRIP_JSESSION_ID_REGEX =
            Pattern.compile("^.*(jsessionid=[0-9a-zA-Z]{32}&?).*$", Pattern.CASE_INSENSITIVE);
    private static final Pattern STRIP_ASPSESSION_REGEX =
            Pattern.compile("^(?:.+)(ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE);
    private static final Pattern STRIP_ASPSESSION2_REGEX =
            Pattern.compile(".*/(\\([0-9a-z]{24}\\)/)(?:[^\\?]+\\.aspx.*)$", Pattern.CASE_INSENSITIVE);
    private static final Pattern STRIP_ASPSESSION3_REGEX =
            Pattern.compile(".*/(\\((?:[a-z]\\([0-9a-z]{24}\\))+\\)/)[^\\?]+\\.aspx.*$", Pattern.CASE_INSENSITIVE);
    private static final Pattern STRIP_SID_REGEX =
            Pattern.compile("^(?:.+)(sid=[0-9a-zA-Z]{32}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE);
    private static final Pattern STRIP_CFSESSION_REGEX =
            Pattern.compile(".+(cfid=[^&]+&cftoken=[^&]+(?:&jsessionid=[^&]+)?&?).*$", Pattern.CASE_INSENSITIVE);

    /** Substring guards; index-aligned with {@link #strippers}. */
    private static final String[] choosers = {
        STRIP_USERINFO_CHOOSER,
        STRIP_WWW_CHOOSER,
        STRIP_PHPSESSION_ID_CHOOSER,
        STRIP_JSESSION_ID_CHOOSER,
        STRIP_ASPSESSION_CHOOSER,
        STRIP_ASPSESSION2_CHOOSER,
        STRIP_ASPSESSION3_CHOOSER,
        STRIP_SID_CHOOSER,
        STRIP_CFSESSION_CHOOSER
    };

    /** Stripping expressions; index-aligned with {@link #choosers}. */
    private static final Pattern[] strippers = {
        STRIP_USERINFO_REGEX,
        STRIP_WWW_REGEX,
        STRIP_PHPSESSION_ID_REGEX,
        STRIP_JSESSION_ID_REGEX,
        STRIP_ASPSESSION_REGEX,
        STRIP_ASPSESSION2_REGEX,
        STRIP_ASPSESSION3_REGEX,
        STRIP_SID_REGEX,
        STRIP_CFSESSION_REGEX
    };

    /**
     * Removes the text captured by group 1 of {@code matcher} from {@code sb} when the matcher
     * matches its whole input.
     *
     * @param sb buffer to edit in place; expected to hold the text the matcher was created on
     * @param matcher matcher to evaluate, may be {@code null}
     * @return {@code true} if group 1 was removed from {@code sb}; {@code false} if the matcher
     *     is {@code null}, does not match, or has no participating group 1
     */
    protected boolean doStripRegexMatch(StringBuilder sb, Matcher matcher) {
        if (matcher == null || !matcher.matches()) {
            return false;
        }
        // Patterns supplied by subclasses may lack group 1 or leave it unmatched; in that case
        // there is nothing to strip, which is not an error.
        if (matcher.groupCount() < 1 || matcher.start(1) < 0) {
            return false;
        }
        sb.delete(matcher.start(1), matcher.end(1));
        return true;
    }

    /**
     * Converts a URL into its lookup key.
     *
     * <p>URLs starting with {@code dns:} are returned unchanged. Anything else is passed through
     * {@link #canonicalize(String)}, given the scheme {@code http://} if it has none, given a
     * trailing {@code /} if it contains no slash after the scheme, and then rebuilt as the host
     * basename, the port (only when it is set and differs from the scheme's default), the escaped
     * path with {@code %20} replaced by {@code +} and repeated slashes collapsed, and the escaped
     * query if present.
     *
     * @param str URL to convert; must not be {@code null}
     * @return the key, or the scheme-qualified URL itself when the URI classes fail with a
     *     {@link StringIndexOutOfBoundsException}
     * @throws URIException if the URI classes reject the URL
     */
    @Override // org.archive.wayback.UrlCanonicalizer
    public String urlStringToKey(String str) throws URIException {
        if (str.startsWith("dns:")) {
            return str;
        }
        String remainder = canonicalize(str);
        String scheme = UrlOperations.urlToScheme(remainder);
        if (scheme == null) {
            scheme = DEFAULT_SCHEME;
        } else {
            remainder = remainder.substring(scheme.length());
        }
        String searchUrl = scheme + remainder;
        if (remainder.indexOf('/') == -1) {
            searchUrl = searchUrl + "/";
        }
        try {
            UsableURI parsed = UsableURIFactory.getInstance(searchUrl);
            // Setting the path to its own value and re-parsing mirrors the original
            // normalization round trip.
            parsed.setPath(parsed.getPath());
            UsableURI uri = UsableURIFactory.getInstance(parsed.getURI());

            String path = uri.getEscapedPath().replace(UsableURIFactory.ESCAPED_SPACE, "+");
            while (path.contains("//")) {
                path = path.replace("//", "/");
            }

            StringBuilder key = new StringBuilder(searchUrl.length());
            key.append(uri.getHostBasename());
            int port = uri.getPort();
            if (port != -1 && port != UrlOperations.schemeToDefaultPort(scheme)) {
                key.append(":").append(port);
            }
            key.append(path);
            String query = uri.getEscapedQuery();
            if (query != null) {
                key.append(QUERY_SEPARATOR).append(query);
            }
            return key.toString();
        } catch (StringIndexOutOfBoundsException e) {
            // Best effort: keep the URL usable as a key instead of failing the caller.
            LOGGER.warning(e.getMessage() + ": " + searchUrl);
            return searchUrl;
        }
    }

    /**
     * Lower-cases {@code str}, applies every applicable stripper once, and tidies the edges of
     * the query string.
     *
     * <p>Lower-casing uses {@link Locale#ROOT} so that results do not depend on the JVM's default
     * locale.
     *
     * @param str URL to canonicalize; {@code null} and the empty string are returned unchanged
     * @return the canonicalized URL
     */
    public String canonicalize(String str) {
        if (str == null || str.length() == 0) {
            return str;
        }
        StringBuilder sb = new StringBuilder(str.toLowerCase(Locale.ROOT));
        for (int i = 0; i < choosers.length; i++) {
            // The substring test is much cheaper than running the regular expression.
            if (sb.indexOf(choosers[i]) != -1) {
                doStripRegexMatch(sb, strippers[i].matcher(sb));
            }
        }
        return trimQueryEdges(sb.toString());
    }

    /**
     * Removes a dangling {@code ?}, an {@code &} directly after the last {@code ?}, or otherwise
     * a trailing {@code &}. Only one of these adjustments is applied per call.
     */
    private static String trimQueryEdges(String url) {
        int queryStart = url.lastIndexOf('?');
        if (queryStart <= 0) {
            return url;
        }
        int lastIndex = url.length() - 1;
        if (queryStart == lastIndex) {
            // "path?" -> "path"
            return url.substring(0, lastIndex);
        }
        if (url.charAt(queryStart + 1) == '&') {
            if (queryStart + 1 == lastIndex) {
                // "path?&" -> "path"
                return url.substring(0, queryStart);
            }
            // "path?&a=b" -> "path?a=b"
            return url.substring(0, queryStart + 1) + url.substring(queryStart + 2);
        }
        if (url.charAt(lastIndex) == '&') {
            // "path?a=b&" -> "path?a=b"
            return url.substring(0, lastIndex);
        }
        return url;
    }

    /** Prints the command line usage to standard error and exits with status 3. */
    private static void USAGE() {
        System.err.println("Usage: [-f FIELD] [-d DELIM]");
        System.exit(3);
    }

    /**
     * Parses a one-based field number given on the command line. Values that are not integers,
     * or are smaller than 1, trigger {@link #USAGE()} (which exits) instead of an uncaught
     * exception later on.
     */
    private static int parseFieldNumber(String value) {
        int field = 0;
        try {
            field = Integer.parseInt(value);
        } catch (NumberFormatException e) {
            USAGE();
        }
        if (field < 1) {
            USAGE();
        }
        return field;
    }

    /** Reports a field that could not be converted to a key, followed by the stack trace. */
    private static void reportInvalidUrl(long lineNumber, String line, String url, Exception cause) {
        System.err.println("Invalid URL in line " + lineNumber + " (" + line + ") skipped (" + url + ")");
        cause.printStackTrace();
    }

    /**
     * Command line filter: reads lines from standard input, replaces the selected fields with
     * their keys and writes the lines to standard output.
     *
     * <p>Options: {@code -f FIELD} selects a one-based field (repeatable, default 1),
     * {@code -d DELIM} sets the field delimiter (default a single space, taken literally), and
     * {@code -cdx} copies lines starting with {@code " CDX "} through unchanged.
     *
     * @param strArr command line arguments
     */
    public static void main(String[] strArr) {
        AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
        List<Integer> fields = new ArrayList<Integer>();
        boolean cdxPassThru = false;
        String delimiter = " ";

        int argIndex = 0;
        while (argIndex < strArr.length) {
            String flag = strArr[argIndex];
            if ("-cdx".equals(flag)) {
                cdxPassThru = true;
                argIndex++;
                continue;
            }
            if (argIndex == strArr.length - 1) {
                USAGE();
            }
            String value = strArr[argIndex + 1];
            if ("-f".equals(flag)) {
                fields.add(Integer.valueOf(parseFieldNumber(value)));
            } else if ("-d".equals(flag)) {
                delimiter = value;
            } else {
                USAGE();
            }
            argIndex += 2;
        }
        if (fields.isEmpty()) {
            fields.add(Integer.valueOf(1));
        }

        // Convert the one-based field numbers to zero-based array indexes.
        int[] columns = new int[fields.size()];
        for (int i = 0; i < columns.length; i++) {
            columns[i] = fields.get(i).intValue() - 1;
        }

        // The delimiter is written back literally when the line is re-joined, so it has to be
        // matched literally as well; an unquoted "|" or "." would otherwise act as a regex.
        Pattern delimiterPattern = Pattern.compile(Pattern.quote(delimiter));

        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in, ByteOp.UTF8));
        StringBuilder out = new StringBuilder();
        long lineNumber = 0;
        String line = null;
        while (true) {
            try {
                line = reader.readLine();
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
            if (line == null) {
                return;
            }
            lineNumber++;
            if (cdxPassThru && line.startsWith(CDX_PREFIX)) {
                System.out.println(line);
                continue;
            }

            String[] parts = delimiterPattern.split(line);
            for (int column : columns) {
                if (column >= parts.length) {
                    System.err.println("Invalid line " + lineNumber + " (" + line + ") skipped");
                    continue;
                }
                try {
                    parts[column] = canonicalizer.urlStringToKey(parts[column]);
                } catch (StringIndexOutOfBoundsException e) {
                    reportInvalidUrl(lineNumber, line, parts[column], e);
                } catch (URIException e) {
                    reportInvalidUrl(lineNumber, line, parts[column], e);
                }
            }

            out.setLength(0);
            for (int i = 0; i < parts.length; i++) {
                if (i > 0) {
                    out.append(delimiter);
                }
                out.append(parts[i]);
            }
            System.out.println(out.toString());
        }
    }

    /**
     * Reports whether the keys produced by this canonicalizer are in SURT form.
     *
     * @return always {@code false}
     */
    @Override // org.archive.wayback.UrlCanonicalizer
    public boolean isSurtForm() {
        return false;
    }
}
