package de.duehl.basics.text.extract.tools;

/*
 * Copyright 2019 Christian Dühl. All rights reserved.
 *
 * This program is free software. You can redistribute it and/or
 * modify it under the same terms as perl:
 *
 * general:  http://dev.perl.org/licenses/
 * GPL:      http://dev.perl.org/licenses/gpl1.html
 * artistic: http://dev.perl.org/licenses/artistic.html
 */

import java.util.ArrayList;
import java.util.List;

import de.duehl.basics.collections.CollectionsHelper;

/**
 * Cleans up the generated words for the process that extracts usable words from a text.
 *
 * A word is dropped if it is {@code null}, is shorter than two characters, starts or ends
 * with a hyphen, contains one of the ASCII digits 0-9 or contains an at sign. All other
 * words are kept in their original order.
 *
 * The filtering is performed once, in the constructor; the result is available through
 * {@link #getCleanedWords()}.
 *
 * @version 1.01     2019-04-25
 * @author Christian Dühl
 */

public class UnwantedWordsRemover {

    /** Minimum length of a usable word (only valid for German; English has "I", "o", ...). */
    private static final int MINIMAL_WORD_LENGTH = 2;

    /** The words to clean up. */
    private final List<String> words;

    /** The cleaned words. */
    private final List<String> cleanedWords;

    /**
     * Constructor. Filters the given words immediately.
     *
     * @param words
     *            The words to clean up. The list itself is not modified. Elements that are
     *            {@code null} are treated as unwanted and dropped.
     * @throws NullPointerException
     *             If the list of words is {@code null}.
     */
    public UnwantedWordsRemover(List<String> words) {
        if (words == null) {
            throw new NullPointerException("words must not be null");
        }

        this.words = words;
        cleanedWords = new ArrayList<>();

        removeUnwantedWords();
    }

    /** Copies every acceptable word from the input list to the list of cleaned words. */
    private void removeUnwantedWords() {
        for (String word : words) {
            if (wordIsOk(word)) {
                cleanedWords.add(word);
            }
        }
    }

    /** Checks whether the word passes all filters. A {@code null} word never does. */
    private boolean wordIsOk(String word) {
        return word != null
                && !wordIsIncomplete(word)
                && !wordContainsNumber(word)
                && !wordContainsAt(word);
    }

    /**
     * Checks whether the word looks like a fragment: it starts or ends with a hyphen or it
     * is shorter than the minimal word length.
     */
    private boolean wordIsIncomplete(String word) {
        return word.length() < MINIMAL_WORD_LENGTH
                || word.startsWith("-")
                || word.endsWith("-");
    }

    /**
     * Checks whether the word contains at least one of the ASCII digits 0 to 9. Digits from
     * other scripts are deliberately not considered.
     */
    private boolean wordContainsNumber(String word) {
        for (int index = 0; index < word.length(); ++index) {
            char character = word.charAt(index);
            if (character >= '0' && character <= '9') {
                return true;
            }
        }

        return false;
    }

    /** Checks whether the word contains an at sign. */
    private boolean wordContainsAt(String word) {
        return word.indexOf('@') >= 0;
    }

    /**
     * Getter for the cleaned words.
     *
     * @return The internal list of cleaned words (not a copy); changes made by the caller
     *         are visible in later calls of this getter.
     */
    public List<String> getCleanedWords() {
        return cleanedWords;
    }

}
