/*
 * Decompiled with CFR 0.152.
 */
package com.hankcs.hanlp.classification.utilities;

import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NotionalTokenizer;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

/**
 * Static helpers for preparing text-classification input: text normalisation,
 * keyword extraction and counting, and loading a corpus laid out on disk as
 * one sub-folder per category with one document per file.
 */
public class TextProcessUtility {
    /**
     * Normalises raw text: every punctuation character ({@code \p{P}}) is replaced
     * by a space, each run of whitespace is collapsed to a single space, and the
     * result is lower-cased. Leading/trailing spaces are not trimmed.
     *
     * @param text the text to normalise; must not be {@code null}
     * @return the normalised text
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String preprocess(String text) {
        // Locale.ROOT keeps the result identical on every machine; with the default
        // locale, e.g. a Turkish JVM would lower-case "I" to a dotless "ı".
        return text.replaceAll("\\p{P}", " ").replaceAll("\\s+", " ").toLowerCase(Locale.ROOT);
    }

    /**
     * Segments the text with {@link NotionalTokenizer#segment} and returns the
     * {@code word} of every resulting term, in order.
     * NOTE(review): presumably the tokenizer drops stop words / function words --
     * confirm against NotionalTokenizer.
     *
     * @param text the text to segment
     * @return the words of the segmented terms, one array slot per term
     */
    public static String[] extractKeywords(String text) {
        List<Term> termList = NotionalTokenizer.segment(text);
        String[] wordArray = new String[termList.size()];
        int index = 0;
        for (Term term : termList) {
            wordArray[index++] = term.word;
        }
        return wordArray;
    }

    /**
     * Counts how many times each keyword occurs in the array.
     *
     * @param keywordArray the keywords to count; must not be {@code null}
     * @return a new mutable map from keyword to its number of occurrences
     *         (empty when the array is empty)
     */
    public static Map<String, Integer> getKeywordCounts(String[] keywordArray) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String keyword : keywordArray) {
            Integer previous = counts.get(keyword);
            counts.put(keyword, previous == null ? 1 : previous + 1);
        }
        return counts;
    }

    /**
     * Loads a corpus from {@code path}: every sub-folder is a category and every
     * entry inside it is read as one document via {@link IOUtil#readTxt}.
     * Plain files directly under {@code path} and sub-folders that cannot be
     * listed are skipped.
     * NOTE(review): how IOUtil.readTxt reports a failed read (e.g. a null entry
     * in the document array) is not visible here -- confirm before relying on it.
     *
     * @param path the corpus root folder
     * @return a map sorted by category (folder) name to that category's documents,
     *         or {@code null} if {@code path} cannot be listed (for example it
     *         does not exist or is not a directory)
     */
    public static Map<String, String[]> loadCorpus(String path) {
        File root = new File(path);
        File[] folders = root.listFiles();
        if (folders == null) {
            return null;
        }
        Map<String, String[]> dataSet = new TreeMap<String, String[]>();
        for (File folder : folders) {
            if (folder.isFile()) {
                continue;
            }
            File[] files = folder.listFiles();
            if (files == null) {
                continue;
            }
            String[] documents = new String[files.length];
            for (int i = 0; i < files.length; ++i) {
                documents[i] = IOUtil.readTxt(files[i].getAbsolutePath());
            }
            dataSet.put(folder.getName(), documents);
        }
        return dataSet;
    }

    /**
     * Loads a corpus like {@link #loadCorpus(String)} but validates the root folder
     * up front and propagates read failures instead of hiding them. Documents are
     * decoded with the given charset.
     *
     * @param folderPath  the corpus root folder; one sub-folder per category
     * @param charsetName the name of the charset used to decode every document
     * @return a map sorted by category (folder) name to that category's documents,
     *         or {@code null} if the root folder could not be listed
     * @throws IllegalArgumentException if {@code folderPath} is {@code null}, does
     *                                  not exist, or is not a directory
     * @throws IOException              if a document cannot be read or the charset
     *                                  is not supported
     */
    public static Map<String, String[]> loadCorpusWithException(String folderPath, String charsetName) throws IOException {
        if (folderPath == null) {
            throw new IllegalArgumentException("\u53c2\u6570 folderPath == null");
        }
        File root = new File(folderPath);
        if (!root.exists()) {
            throw new IllegalArgumentException(String.format("\u76ee\u5f55 %s \u4e0d\u5b58\u5728", root.getAbsolutePath()));
        }
        if (!root.isDirectory()) {
            throw new IllegalArgumentException(String.format("\u76ee\u5f55 %s \u4e0d\u662f\u4e00\u4e2a\u76ee\u5f55", root.getAbsolutePath()));
        }
        File[] folders = root.listFiles();
        if (folders == null) {
            // Kept for backward compatibility: a listing failure yields null, not an exception.
            return null;
        }
        Map<String, String[]> dataSet = new TreeMap<String, String[]>();
        for (File folder : folders) {
            if (folder.isFile()) {
                continue;
            }
            File[] files = folder.listFiles();
            if (files == null) {
                continue;
            }
            String[] documents = new String[files.length];
            for (int i = 0; i < files.length; ++i) {
                documents[i] = TextProcessUtility.readTxt(files[i], charsetName);
            }
            dataSet.put(folder.getName(), documents);
        }
        return dataSet;
    }

    /**
     * Reads the whole file and decodes it with the given charset.
     *
     * @param file        the file to read
     * @param charsetName the name of the charset used to decode the bytes
     * @return the complete file content as a string (empty for an empty file)
     * @throws IOException if the file cannot be opened or read, or the charset
     *                     is not supported
     */
    public static String readTxt(File file, String charsetName) throws IOException {
        FileInputStream is = new FileInputStream(file);
        try {
            // available() is only an estimate, so it is used purely as a capacity
            // hint; the loop below reads until EOF regardless of that value.
            ByteArrayOutputStream content = new ByteArrayOutputStream(Math.max(is.available(), 32));
            byte[] buffer = new byte[8192];
            int len;
            while ((len = is.read(buffer)) != -1) {
                content.write(buffer, 0, len);
            }
            return content.toString(charsetName);
        } finally {
            // Always release the file handle, even when reading or decoding fails.
            is.close();
        }
    }

    /**
     * Loads a corpus with {@link #loadCorpusWithException(String, String)},
     * decoding every document as UTF-8.
     *
     * @param corpusPath the corpus root folder; one sub-folder per category
     * @return a map sorted by category (folder) name to that category's documents,
     *         or {@code null} if the root folder could not be listed
     * @throws IllegalArgumentException if {@code corpusPath} is {@code null}, does
     *                                  not exist, or is not a directory
     * @throws IOException              if a document cannot be read
     */
    public static Map<String, String[]> loadCorpusWithException(String corpusPath) throws IOException {
        return TextProcessUtility.loadCorpusWithException(corpusPath, "UTF-8");
    }
}

