package org.ujmp.core.text;

import ch.qos.logback.classic.spi.CallerData;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import org.aspectj.apache.bcel.Constants;
import org.ujmp.core.DenseMatrix;
import org.ujmp.core.Matrix;
import org.ujmp.core.collections.list.FastArrayList;
import org.ujmp.core.doublematrix.DenseDoubleMatrix2D;
import org.ujmp.core.doublematrix.impl.DefaultSparseDoubleMatrix;
import org.ujmp.core.util.VerifyUtil;

/* loaded from: input_file:BOOT-INF/lib/ujmp-core-0.3.0.jar:org/ujmp/core/text/TextUtil.class */
public abstract class TextUtil {
    // Regular expressions classifying a single token. Every pattern is anchored
    // with ^ and $, so it has to match the whole token.

    /** One or more ASCII letters or digits. */
    public static final String ALPHA_NUMERIC_REGEX = "^[A-Za-z0-9]+$";
    /** A dash with at least one character on each side. */
    public static final String HAS_DASH_REGEX = "^.+-.+$";
    /** A leading dash followed by at least one character. */
    public static final String INIT_DASH_REGEX = "^-.+$";
    /**
     * At least one character followed by a trailing dash. The previous value
     * was a copy of {@link #INIT_DASH_REGEX} and matched a leading dash instead.
     */
    public static final String END_DASH_REGEX = "^.+-$";
    /** Exactly one of the characters {@code , . : ; ! ?}. */
    public static final String PUNCTUATION_REGEX = "^[,.:;!?]$";

    // Question / exclamation mark runs. The TWO_ and THREE_ patterns spell out one
    // character class per mark: a class such as [??] matches a single character only.
    public static final String ONE_QUESTION_MARK_REGEX = "^[?]$";
    public static final String TWO_QUESTION_MARKS_REGEX = "^[?][?]$";
    public static final String THREE_QUESTION_MARKS_REGEX = "^[?][?][?]$";
    /** Two or more question marks. */
    public static final String MULTIPLE_QUESTION_MARKS_REGEX = "^[?][?]+$";
    public static final String ONE_EXCLAMATION_MARK_REGEX = "^[!]$";
    public static final String TWO_EXCLAMATION_MARKS_REGEX = "^[!][!]$";
    public static final String THREE_EXCLAMATION_MARKS_REGEX = "^[!][!][!]$";
    /** Two or more exclamation marks. */
    public static final String MULTIPLE_EXCLAMATION_MARKS_REGEX = "^[!][!]+$";
    public static final String QUESTION_EXCLAMATION_MARK_REGEX = "^[?][!]$";
    public static final String EXCLAMATION_QUESTION_MARK_REGEX = "^[!][?]$";

    // Capitalisation patterns (ASCII letters only).
    /** An upper-case letter followed by at least one arbitrary character. */
    public static final String INIT_CAPS_REGEX = "^[A-Z].+$";
    /** An upper-case letter followed by one or more lower-case letters. */
    public static final String INIT_CAPS_ALPHA_REGEX = "^[A-Z][a-z]+$";
    public static final String ONE_CAP_REGEX = "^[A-Z]$";
    public static final String TWO_CAPS_REGEX = "^[A-Z][A-Z]$";
    public static final String THREE_CAPS_REGEX = "^[A-Z][A-Z][A-Z]$";
    public static final String FOUR_CAPS_REGEX = "^[A-Z][A-Z][A-Z][A-Z]$";
    public static final String ALL_CAPS_REGEX = "^[A-Z]+$";
    /** One or more ASCII letters of either case. */
    public static final String CAPS_MIX_REGEX = "^[A-Za-z]+$";

    // Numeric patterns.
    public static final String ONE_DIGIT_REGEX = "^[0-9]$";
    public static final String TWO_DIGITS_REGEX = "^[0-9][0-9]$";
    public static final String THREE_DIGITS_REGEX = "^[0-9][0-9][0-9]$";
    public static final String FOUR_DIGITS_REGEX = "^[0-9][0-9][0-9][0-9]$";
    /** A digit with at least one character on each side. */
    public static final String HAS_DIGIT_REGEX = "^.+[0-9].+$";
    public static final String POSITIVE_INTEGER_REGEX = "^[0-9]+$";
    public static final String NEGATIVE_INTEGER_REGEX = "^-[0-9]+$";
    public static final String FLOATING_POINT_NUMBER_REGEX = "^[-+]?[0-9]*\\.?[0-9]+$";
    public static final String EXP_NUMBER_REGEX = "^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$";
    public static final String ROMAN_NUMBER_SMALL_REGEX = "^[ivxdlcm]+$";
    public static final String ROMAN_NUMBER_CAPITAL_REGEX = "^[IVXDLCM]+$";

    // Miscellaneous token shapes.
    /** A single ASCII letter followed by a period. */
    public static final String SINGLE_INITIAL_REGEX = "^[a-zA-Z]\\.$";
    /**
     * Non-empty text enclosed in literal parentheses. The parentheses are
     * escaped: unescaped they only formed a capturing group, so the previous
     * pattern matched every non-empty string.
     */
    public static final String IN_PARENTHESES_REGEX = "^\\(.+\\)$";
    /** One of the letters P, B, C, U followed by four upper-case hex digits. */
    public static final String OBD_REGEX = "^[PBCU][0-9A-F][0-9A-F][0-9A-F][0-9A-F]$";
    /** Four digits starting with 1 or 2. */
    public static final String YEAR_REGEX = "^[12][0-9][0-9][0-9]$";
    /** Two or more hex digits of either case. */
    public static final String HEX_REGEX = "^[0-9A-Fa-f][0-9A-Fa-f]+$";
    public static final String EMAIL_REGEX = "^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$";
    public static final String IP_REGEX = "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$";
    public static final String HTML_REGEX = "^<([a-z]+)([^<]+)*(?:>(.*)<\\/\\1>|\\s+\\/>)$";
    public static final String URL_REGEX = "^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$";

    /**
     * Counts the occurrences of characters with a code below 256 in the
     * lower-cased input.
     *
     * @param str  text to analyse; it is lower-cased before counting
     * @param cArr characters to count; when no characters are given, every
     *             character with a code below 256 is counted
     * @return the result of {@code Matrix.Factory.linkToArray} applied to a
     *         256-element array of counts indexed by character code
     */
    public static final DenseDoubleMatrix2D getCharacterFrequencies(String str, char... cArr) {
        final String text = str.toLowerCase();
        final boolean[] counted = createCharacterVector(cArr);
        final double[] counts = new double[256];
        for (int pos = text.length() - 1; pos >= 0; pos--) {
            final char c = text.charAt(pos);
            // Characters outside the table or not selected are ignored.
            if (c >= 256 || !counted[c]) {
                continue;
            }
            counts[c] += 1.0d;
        }
        return Matrix.Factory.linkToArray(counts);
    }

    /**
     * Splits a single line of text into sentences at the characters
     * {@code . : ; ! ?}.
     *
     * <p>Each delimiter is attached to the end of the sentence collected so
     * far. A text fragment starts a new sentence unless the sentence collected
     * so far ends with an abbreviation (see
     * {@link #endsWithAbbreviation(String)}), in which case the fragment is
     * appended to it. Whitespace-only fragments between delimiters are skipped.
     * A delimiter at the very start of the line is only kept when another
     * delimiter follows it directly.
     *
     * <p>Compared with the previous implementation, input consisting of fewer
     * than three tokens and a final fragment without closing punctuation are no
     * longer dropped, and a trailing delimiter can no longer index an empty
     * result list.
     *
     * @param str the text to split; must not be null and must be a single line
     * @return the sentences in order of appearance, including their original
     *         leading whitespace and closing delimiters
     */
    public static final List<String> splitLineIntoSentences(String str) {
        VerifyUtil.verifyNotNull(str, "text cannot be null");
        VerifyUtil.verifyTrue(str.split("\n").length == 1, "text must be in one line");
        StringTokenizer tokenizer = new StringTokenizer(str, ".:;!?", true);
        List<String> sentences = new FastArrayList<String>();
        String previous = null;
        while (tokenizer.hasMoreTokens()) {
            String current = tokenizer.nextToken();
            if (previous != null) {
                appendSentenceToken(sentences, previous, current);
            } else if (!tokenizer.hasMoreTokens() && !current.trim().isEmpty()) {
                // The line consists of one single token: it is the whole sentence.
                sentences.add(current);
            }
            // Otherwise the first token is deferred: it is emitted together with
            // the delimiter that follows it.
            previous = current;
        }
        return sentences;
    }

    /** Adds one tokenizer token to the sentence list, merging it into the last sentence where required. */
    private static void appendSentenceToken(List<String> sentences, String previous, String current) {
        if (current.trim().isEmpty()) {
            return;
        }
        int last = sentences.size() - 1;
        if (isSentenceDelimiter(current)) {
            if (last < 0) {
                // Nothing collected yet: the delimiter closes the preceding token.
                sentences.add(previous + current);
            } else {
                sentences.set(last, sentences.get(last) + current);
            }
        } else if (last >= 0 && endsWithAbbreviation(sentences.get(last))) {
            // The period belonged to an abbreviation, so the sentence continues.
            sentences.set(last, sentences.get(last) + current);
        } else {
            sentences.add(current);
        }
    }

    /** Returns true if the token is exactly one of the delimiter characters {@code . : ; ! ?}. */
    private static boolean isSentenceDelimiter(String token) {
        return token.length() == 1 && ".:;!?".indexOf(token.charAt(0)) >= 0;
    }

    /**
     * Converts a text into a block of sentence objects, each holding its token
     * objects. Line breaks ({@code \r\n} and {@code \n}) are replaced by spaces
     * before the text is split into sentences.
     *
     * <p>Sentences and the tokens inside each sentence are numbered from zero;
     * the number is stored under the key {@code "Id"}.
     *
     * @param str the text to convert
     * @return the text block containing one sentence object per sentence
     */
    public static final DefaultTextBlock splitTextIntoObjects(String str) {
        final DefaultTextBlock block = new DefaultTextBlock(new TextSentence[0]);
        final String singleLine = str.replaceAll("\r\n", " ").replaceAll("\n", " ");
        int sentenceId = 0;
        for (String sentenceText : splitLineIntoSentences(singleLine)) {
            final DefaultTextSentence sentence = new DefaultTextSentence(new TextToken[0]);
            sentence.setMetaData("Id", Integer.valueOf(sentenceId++));
            int tokenId = 0;
            // Gram size 0: the token list is requested without padding entries.
            for (String tokenText : splitSentenceIntoTokens(sentenceText, 0)) {
                final DefaultTextToken token = new DefaultTextToken(tokenText);
                token.put("Id", Integer.valueOf(tokenId++));
                sentence.add(token);
            }
            block.add(sentence);
        }
        return block;
    }

    /**
     * Builds the word trigrams of a text. The text is split into lines, each
     * line into sentences, and each sentence into tokens padded for a gram
     * size of 3; the trigrams of all sentences are concatenated.
     *
     * @param str the text to process
     * @return all trigrams in order of appearance, each a list of three tokens
     */
    public static final List<List<String>> createWordTrigrams(String str) {
        final List<List<String>> trigrams = new FastArrayList<List<String>>();
        for (String line : splitTextIntoLines(str)) {
            for (String sentence : splitLineIntoSentences(line)) {
                final List<String> paddedTokens = splitSentenceIntoTokens(sentence, 3);
                trigrams.addAll(createWordTrigrams(paddedTokens));
            }
        }
        return trigrams;
    }

    /**
     * Splits a text at every {@code \n} character.
     *
     * @param str the text to split; must not be null
     * @return a fixed-size list view of the array produced by
     *         {@code str.split("\n")}
     */
    public static final List<String> splitTextIntoLines(String str) {
        VerifyUtil.verifyNotNull(str, "text cannot be null");
        final String[] lines = str.split("\n");
        return Arrays.asList(lines);
    }

    /**
     * Builds the word bigrams of a text. The text is split into lines, each
     * line into sentences, and each sentence into tokens padded for a gram
     * size of 2; the bigrams of all sentences are concatenated.
     *
     * @param str the text to process
     * @return all bigrams in order of appearance, each a list of two tokens
     */
    public static final List<List<String>> createWordBigrams(String str) {
        final List<List<String>> bigrams = new FastArrayList<List<String>>();
        for (String line : splitTextIntoLines(str)) {
            for (String sentence : splitLineIntoSentences(line)) {
                final List<String> paddedTokens = splitSentenceIntoTokens(sentence, 2);
                bigrams.addAll(createWordBigrams(paddedTokens));
            }
        }
        return bigrams;
    }

    /**
     * Collects the tokens of every sentence of a text into one list.
     *
     * @param str the text to process
     * @param i   gram size passed on to
     *            {@link #splitSentenceIntoTokens(String, int)}; it controls how
     *            many empty padding tokens surround each sentence
     * @return the tokens of all sentences in order of appearance
     */
    public static final List<String> createWordUnigrams(String str, int i) {
        final List<String> unigrams = new FastArrayList<String>();
        for (String line : splitTextIntoLines(str)) {
            for (String sentence : splitLineIntoSentences(line)) {
                unigrams.addAll(splitSentenceIntoTokens(sentence, i));
            }
        }
        return unigrams;
    }

    /**
     * Counts how often each word bigram occurs in a text.
     *
     * @param str the text to process
     * @return a map from bigram (list of two tokens) to its number of occurrences
     */
    public static Map<List<String>, Integer> getWordBigramCounts(String str) {
        final List<List<String>> bigrams = createWordBigrams(str);
        final Map<List<String>, Integer> counts = new HashMap<List<String>, Integer>();
        for (List<String> bigram : bigrams) {
            final Integer seen = counts.get(bigram);
            if (seen == null) {
                counts.put(bigram, Integer.valueOf(1));
            } else {
                counts.put(bigram, Integer.valueOf(seen.intValue() + 1));
            }
        }
        return counts;
    }

    /**
     * Counts how often each token occurs in a text.
     *
     * @param str the text to process
     * @param i   gram size passed on to {@link #createWordUnigrams(String, int)}
     * @return a map from token to its number of occurrences
     */
    public static Map<String, Integer> getWordUnigramCounts(String str, int i) {
        final List<String> unigrams = createWordUnigrams(str, i);
        final Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String word : unigrams) {
            final Integer seen = counts.get(word);
            if (seen == null) {
                counts.put(word, Integer.valueOf(1));
            } else {
                counts.put(word, Integer.valueOf(seen.intValue() + 1));
            }
        }
        return counts;
    }

    /**
     * Splits one sentence into tokens. Punctuation, brackets, quotes, slashes
     * and dashes are returned as tokens of their own; whitespace is dropped.
     *
     * <p>Non-breaking spaces (character 160) are treated like ordinary
     * spaces. The previous implementation called {@code str.replace(...)} but
     * discarded the result, so the replacement never took effect.
     *
     * @param str the sentence to split; must not be null and must be a single line
     * @param i   gram size; {@code i - 1} empty strings are added before the
     *            first and after the last token as padding (none for
     *            {@code i <= 1})
     * @return the interned tokens in order of appearance, surrounded by padding
     */
    public static final List<String> splitSentenceIntoTokens(String str, int i) {
        VerifyUtil.verifyNotNull(str, "text cannot be null");
        VerifyUtil.verifyTrue(str.split("\n").length == 1, "text must be in one line");
        // Strings are immutable: the normalised copy is the one that must be tokenized.
        String normalized = str.replace((char) 160, ' ');
        StringTokenizer tokenizer = new StringTokenizer(normalized, "  .;,、،:&\\⁄/”“‘\"―—–‒‐-!?{}()[]", true);
        List<String> tokens = new FastArrayList<String>();
        for (int pad = 1; pad < i; pad++) {
            tokens.add("");
        }
        while (tokenizer.hasMoreTokens()) {
            // Whitespace delimiters come back as tokens too; trimming empties them.
            String token = tokenizer.nextToken().trim();
            if (!token.isEmpty()) {
                tokens.add(token.intern());
            }
        }
        for (int pad = 1; pad < i; pad++) {
            tokens.add("");
        }
        return tokens;
    }

    /**
     * Builds all pairs of adjacent tokens of a token list.
     *
     * @param list the tokens of one sentence
     * @return one two-element list per adjacent pair, in order; empty when the
     *         list has fewer than two tokens
     */
    public static final List<List<String>> createWordBigrams(List<String> list) {
        final List<List<String>> bigrams = new FastArrayList<List<String>>();
        // 'second' is the index of the second token of the pair.
        for (int second = 1; second < list.size(); second++) {
            final List<String> pair = new FastArrayList<String>(2);
            pair.add(list.get(second - 1));
            pair.add(list.get(second));
            bigrams.add(pair);
        }
        return bigrams;
    }

    /**
     * Builds all triples of adjacent tokens of a token list.
     *
     * @param list the tokens of one sentence
     * @return one three-element list per adjacent triple, in order; empty when
     *         the list has fewer than three tokens
     */
    public static final List<List<String>> createWordTrigrams(List<String> list) {
        final List<List<String>> trigrams = new FastArrayList<List<String>>();
        // 'third' is the index of the last token of the triple.
        for (int third = 2; third < list.size(); third++) {
            final List<String> triple = new FastArrayList<String>(3);
            triple.add(list.get(third - 2));
            triple.add(list.get(third - 1));
            triple.add(list.get(third));
            trigrams.add(triple);
        }
        return trigrams;
    }

    /**
     * Tells whether the text ends with something that looks like an
     * abbreviation or an enumeration marker rather than the end of a sentence.
     * The check is case-insensitive (the text is lower-cased first).
     *
     * <p>Recognised endings:
     * <ul>
     * <li>space, one digit, then {@code .} or {@code :}</li>
     * <li>a two-digit number from 10 to 39 followed by {@code .}</li>
     * <li>space or {@code .}, one letter (a-z or a German umlaut / sharp s),
     * then {@code .}</li>
     * <li>one of a fixed list of abbreviations preceded by a space, such as
     * {@code " etc."} or {@code " dr."}</li>
     * </ul>
     *
     * @param str the text to inspect
     * @return true if the text ends with one of the recognised endings
     */
    public static boolean endsWithAbbreviation(String str) {
        final String text = str.toLowerCase();
        final int length = text.length();
        // Every recognised ending is at least three characters long.
        if (length < 3) {
            return false;
        }
        final char lead = text.charAt(length - 3);
        final char middle = text.charAt(length - 2);
        final char last = text.charAt(length - 1);
        final boolean middleIsDigit = middle >= '0' && middle <= '9';

        if (last == ':') {
            // " 0:" to " 9:"
            return lead == ' ' && middleIsDigit;
        }
        if (last != '.') {
            return false;
        }

        // a-z plus a-umlaut, o-umlaut, u-umlaut and sharp s
        final boolean middleIsLetter = (middle >= 'a' && middle <= 'z')
                || "\u00e4\u00f6\u00fc\u00df".indexOf(middle) >= 0;
        if (lead == ' ' && (middleIsDigit || middleIsLetter)) {
            // " 0." to " 9." and single letters such as " a."
            return true;
        }
        if (lead == '.' && middleIsLetter) {
            // dotted initials such as ".g." in "e.g."
            return true;
        }
        if (lead >= '1' && lead <= '3' && middleIsDigit) {
            // "10." to "39."
            return true;
        }

        final String[] abbreviations = {
            " ca.", " vs.", " rep.", " etc.", " usw.", " resp.", " incl.", " inkl.",
            " insges.", " zyl.", " cyl.", " dr.", " prof.", " gr.", " ppm.", " ggf."
        };
        for (String abbreviation : abbreviations) {
            if (text.endsWith(abbreviation)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds a 256-entry lookup table that marks the selected characters.
     *
     * @param cArr the characters to mark; characters with a code of 256 or
     *             above are ignored. When no characters are given, every entry
     *             is marked.
     * @return the lookup table indexed by character code
     */
    private static final boolean[] createCharacterVector(char... cArr) {
        final boolean[] selected = new boolean[256];
        if (cArr.length == 0) {
            Arrays.fill(selected, true);
            return selected;
        }
        for (int k = 0; k < cArr.length; k++) {
            final char c = cArr[k];
            if (c < 256) {
                selected[c] = true;
            }
        }
        return selected;
    }

    /**
     * Counts the character bigrams of the lower-cased text. The text is padded
     * with one space on each side, so the first and last character also form a
     * bigram with a space.
     *
     * <p>A bigram is counted only if both characters have a code below 256 and
     * are selected by {@code cArr}. The count of the bigram {@code (a, b)} is
     * stored in row {@code a * 256 + b} of the result.
     *
     * @param str  the text to analyse
     * @param cArr the characters to consider; when no characters are given,
     *             every character with a code below 256 is considered
     * @return a sparse matrix created with size {@code 256 * 256} by 1 holding
     *         the bigram counts
     */
    public static final Matrix getCharacterBigramFrequencies(String str, char... cArr) {
        String padded = " " + str.toLowerCase() + " ";
        boolean[] selected = createCharacterVector(cArr);
        // One row per possible pair of 8-bit characters. The size is spelled out here
        // instead of borrowing an unrelated third-party constant for it.
        DefaultSparseDoubleMatrix frequencies = new DefaultSparseDoubleMatrix(256 * 256, 1);
        for (int pos = padded.length() - 1; pos > 0; pos--) {
            char first = padded.charAt(pos - 1);
            char second = padded.charAt(pos);
            if (first < 256 && second < 256 && selected[first] && selected[second]) {
                int row = (first * 256) + second;
                frequencies.setAsDouble(frequencies.getAsDouble(row, 0) + 1.0d, row, 0);
            }
        }
        return frequencies;
    }

    /**
     * Counts the character trigrams of the lower-cased text. The text is padded
     * with one space on each side.
     *
     * <p>A trigram is counted only if all three characters have a code below
     * 256 and are selected by {@code cArr}. The count of the trigram
     * {@code (a, b, c)} is stored in row {@code a * 65536 + b * 256 + c} of the
     * result.
     *
     * <p>Fixes over the previous implementation: the loop no longer reads
     * position -1 (which made every call fail with a
     * {@code StringIndexOutOfBoundsException}), and the first character now
     * contributes to the row index instead of being multiplied by zero.
     *
     * @param str  the text to analyse
     * @param cArr the characters to consider; when no characters are given,
     *             every character with a code below 256 is considered
     * @return a sparse matrix created with size {@code 256 * 256 * 256} by 1
     *         holding the trigram counts
     */
    public static final Matrix getCharacterTrigramFrequencies(String str, char... cArr) {
        String padded = " " + str.toLowerCase() + " ";
        boolean[] selected = createCharacterVector(cArr);
        // One row per possible triple of 8-bit characters. The size is spelled out here
        // instead of borrowing an unrelated third-party constant for it.
        DefaultSparseDoubleMatrix frequencies = new DefaultSparseDoubleMatrix(256 * 256 * 256, 1);
        // 'pos' is the index of the last character of the trigram, so it must stay >= 2.
        for (int pos = padded.length() - 1; pos >= 2; pos--) {
            char first = padded.charAt(pos - 2);
            char second = padded.charAt(pos - 1);
            char third = padded.charAt(pos);
            if (first < 256 && second < 256 && third < 256
                    && selected[first] && selected[second] && selected[third]) {
                int row = (first * 256 * 256) + (second * 256) + third;
                frequencies.setAsDouble(frequencies.getAsDouble(row, 0) + 1.0d, row, 0);
            }
        }
        return frequencies;
    }

    /**
     * Builds a bag-of-words column vector for a text against a fixed
     * vocabulary. The text is split at whitespace and at the characters
     * {@code , . : ; ? ! [ ] '}; row {@code k} of the result counts the words
     * equal to {@code list.get(k)}.
     *
     * <p>Words that do not occur in the vocabulary are ignored. Previously the
     * index -1 returned by {@code indexOf} for such words was used as a row
     * index.
     *
     * @param str  the text to convert
     * @param list the vocabulary; a word is counted at the position of its
     *             first occurrence in this list
     * @return a matrix with {@code list.size()} rows and one column
     */
    public static final Matrix createBagOfWordsVector(String str, List<String> list) {
        DenseMatrix counts = Matrix.Factory.zeros(list.size(), 1L);
        StringTokenizer tokenizer = new StringTokenizer(str, " \t\n\r\f,.:;?![]'");
        while (tokenizer.hasMoreElements()) {
            long row = list.indexOf(tokenizer.nextElement());
            if (row < 0) {
                // Word is not part of the vocabulary.
                continue;
            }
            counts.setAsDouble(counts.getAsDouble(row, 0) + 1.0d, row, 0);
        }
        return counts;
    }

    /**
     * Splits a sentence into tokens (gram size 1, so no padding) and wraps each
     * token in a {@code DefaultTextToken}.
     *
     * @param str the sentence to convert
     * @return the token objects in order of appearance
     */
    public static Collection<TextToken> convertSentenceToTextTokens(String str) {
        final List<String> words = splitSentenceIntoTokens(str, 1);
        final Collection<TextToken> tokens = new ArrayList<TextToken>();
        for (String word : words) {
            tokens.add(new DefaultTextToken(word));
        }
        return tokens;
    }

    /**
     * Splits a text into sentences and wraps each sentence string in a
     * {@code DefaultTextSentence}.
     *
     * @param str the text to convert
     * @return the sentence objects in order of appearance
     */
    public static Collection<TextSentence> convertToTextBlockToSentences(String str) {
        final List<String> sentenceTexts = splitTextIntoSentences(str);
        final Collection<TextSentence> sentences = new ArrayList<TextSentence>();
        for (String sentenceText : sentenceTexts) {
            sentences.add(new DefaultTextSentence(sentenceText));
        }
        return sentences;
    }

    /**
     * Splits a possibly multi-line text into sentences. Every {@code \n} is
     * replaced by a space first, so the text is handled as a single line.
     *
     * @param str the text to split
     * @return the sentences as produced by {@link #splitLineIntoSentences(String)}
     */
    public static List<String> splitTextIntoSentences(String str) {
        final String singleLine = str.replaceAll("\n", " ");
        return splitLineIntoSentences(singleLine);
    }

    /**
     * Converts a text into a hashed word-count vector with 131072 rows.
     *
     * @param str the text to convert
     * @return the result of {@link #stringToVector(String, int)} for 131072 rows
     */
    public static Matrix stringToVector(String str) {
        final int defaultRows = 131072;
        return stringToVector(str, defaultRows);
    }

    /**
     * Converts a text into a hashed word-count column vector. The text is
     * split at whitespace and at the characters {@code , . : ; ? ! [ ] '}; each
     * word is lower-cased and counted in the row given by the absolute value
     * of its hash code modulo {@code i}.
     *
     * <p>The absolute value is taken in {@code long} arithmetic.
     * {@code Math.abs(int)} returns a negative number for
     * {@code Integer.MIN_VALUE}, which previously produced a negative row
     * index for words with that hash code (for example
     * {@code "polygenelubricants"}). The row of every other word is unchanged.
     *
     * @param str the text to convert
     * @param i   the number of rows of the result
     * @return a matrix with {@code i} rows and one column holding the counts
     */
    public static Matrix stringToVector(String str, int i) {
        DenseMatrix counts = Matrix.Factory.zeros(i, 1L);
        StringTokenizer tokenizer = new StringTokenizer(str, " \t\n\r\f,.:;?![]'");
        while (tokenizer.hasMoreElements()) {
            int hash = tokenizer.nextElement().toString().toLowerCase().hashCode();
            // Widen before Math.abs so that Integer.MIN_VALUE becomes positive.
            long row = Math.abs((long) hash) % i;
            counts.setAsDouble(counts.getAsDouble(row, 0) + 1.0d, row, 0);
        }
        return counts;
    }
}
